In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session used by every cell below, or reuse one that is
# already running; the application is registered under the name 'dodo'.
spark = (
    SparkSession.builder
    .appName('dodo')
    .getOrCreate()
)

In [4]:
# Load the people CSV from HDFS. The first line is treated as the header and
# column types are inferred from the data rather than read as plain strings.
df = spark.read.csv(
    'hdfs://localhost:9000/Python/people2.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 5 rows of the people data.
df.show(5)

+----------+------+-----------+------+-------------------+-------+-------+------+------+-------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| people_id|char_1|    group_1|char_2|               date| char_3| char_4|char_5|char_6| char_7|char_8|char_9|char_10|char_11|char_12|char_13|char_14|char_15|char_16|char_17|char_18|char_19|char_20|char_21|char_22|char_23|char_24|char_25|char_26|char_27|char_28|char_29|char_30|char_31|char_32|char_33|char_34|char_35|char_36|char_37|char_38|
+----------+------+-----------+------+-------------------+-------+-------+------+------+-------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+----

In [6]:
# Check which column types inferSchema picked for the CSV columns.
df.printSchema()

root
 |-- people_id: string (nullable = true)
 |-- char_1: string (nullable = true)
 |-- group_1: string (nullable = true)
 |-- char_2: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- char_3: string (nullable = true)
 |-- char_4: string (nullable = true)
 |-- char_5: string (nullable = true)
 |-- char_6: string (nullable = true)
 |-- char_7: string (nullable = true)
 |-- char_8: string (nullable = true)
 |-- char_9: string (nullable = true)
 |-- char_10: boolean (nullable = true)
 |-- char_11: boolean (nullable = true)
 |-- char_12: boolean (nullable = true)
 |-- char_13: boolean (nullable = true)
 |-- char_14: boolean (nullable = true)
 |-- char_15: boolean (nullable = true)
 |-- char_16: boolean (nullable = true)
 |-- char_17: boolean (nullable = true)
 |-- char_18: boolean (nullable = true)
 |-- char_19: boolean (nullable = true)
 |-- char_20: boolean (nullable = true)
 |-- char_21: boolean (nullable = true)
 |-- char_22: boolean (nullable = true)
 |-- char_23: 

In [7]:
from pyspark.ml.regression import LinearRegression

In [8]:
# Read the sample regression data (libsvm format) from HDFS. The libsvm
# reader yields a `label` column and a `features` vector column.
training = (
    spark.read
    .format('libsvm')
    .load('hdfs://localhost:9000/Python/sample_linear_regression_data.txt')
)

In [9]:
# Preview the label/features rows (show() prints the first 20 by default).
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [10]:
# Confirm the reader produced a double `label` and a vector `features` column.
training.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
# Summary statistics (count, mean, stddev, min, max) for the label column.
training.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                501|
|   mean|0.25688882219498976|
| stddev| 10.317884030544564|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [12]:
# Configure the (not yet fitted) linear regression estimator. The input
# column names match the `label` / `features` columns of the libsvm data;
# predictions are written to a column named 'prediction'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [13]:
# Fit on the full dataset. No rows are held out here, so this model can only
# be scored against the same data it was trained on.
lrmodel = lr.fit(training)

In [14]:
# Fitted weights of the linear model, one per feature.
lrmodel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [15]:
# Fitted intercept (bias) term of the linear model.
lrmodel.intercept

0.14228558260358093

In [16]:
# Metrics computed on the data the model was fitted on (`training`).
training_summary = lrmodel.summary

In [17]:
# RMSE on the training rows themselves: an in-sample error, not an estimate
# of how the model performs on unseen data.
training_summary.rootMeanSquaredError

10.16309157133015

In [18]:
# Train/test split: fit on one part of the data and evaluate on held-out rows.

In [19]:
# Load the complete sample regression dataset (libsvm format) for splitting.
# NOTE(review): this is the same HDFS file already read into `training`
# earlier in the notebook; it is read again here under a new name.
all_data = spark.read.load(
    'hdfs://localhost:9000/Python/sample_linear_regression_data.txt',
    format='libsvm',
)

In [24]:
# Split the rows roughly 70% / 30% into training and test sets.
# randomSplit assigns rows at random, so without a seed every re-run of this
# cell yields a different split and different metrics in the cells below.
# A fixed seed keeps the split stable across re-runs of the notebook.
train, test = all_data.randomSplit([0.7, 0.3], seed=42)

In [29]:
# Summary statistics of the label column in the training split; the count
# shows how many rows ended up in `train`.
train.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                331|
|   mean| -0.435887582296799|
| stddev| 10.077456333344175|
|    min|-28.046018037776633|
|    max| 26.903524792043335|
+-------+-------------------+



In [30]:
# Refit the same estimator, this time on the training split only, so the
# test split stays unseen by the model.
correct_model = lr.fit(train)

In [31]:
# Score the model on the held-out test split; the returned summary exposes
# the residuals and error metrics inspected in the following cells.
test_result = correct_model.evaluate(test)

In [33]:
# Per-row residuals on the test split (first 20 rows shown).
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-29.598604265419972|
|-27.565841847668747|
|-22.716528534415325|
| -20.79513510802338|
| -21.00987272984856|
| -20.14328326746602|
|-13.864185805011873|
|-18.206390719694134|
|-17.935029605052186|
|-12.945150876663947|
|-13.417112885767516|
| -17.07627972351877|
|-14.246864016291372|
|-14.084604424143985|
|-13.166162136197915|
|  -11.8425504177029|
|-10.177733096239225|
|-12.328552464859245|
|-11.057783721067905|
| -14.46926540675872|
+-------------------+
only showing top 20 rows



In [34]:
# RMSE on the held-out test split: the out-of-sample counterpart of the
# training RMSE computed earlier.
test_result.rootMeanSquaredError

10.879394942274107

In [35]:
# Keep only the feature vectors of the test split, dropping the label column,
# to stand in for new data whose labels are unknown.
unlabeled_data = test.select('features')

In [36]:
# Preview the features-only frame (first 20 rows).
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [37]:
# Apply the fitted model to the unlabeled rows; the result keeps `features`
# and adds a `prediction` column.
prediction = correct_model.transform(unlabeled_data)

In [39]:
# Preview the predicted values next to their feature vectors.
prediction.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  1.0271253956765463|
|(10,[0,1,2,3,4,5,...|  0.7603584191856749|
|(10,[0,1,2,3,4,5,...| -0.7943555559076456|
|(10,[0,1,2,3,4,5,...| -2.6923050129131303|
|(10,[0,1,2,3,4,5,...|  1.1368816917801534|
|(10,[0,1,2,3,4,5,...| 0.36052047785148417|
|(10,[0,1,2,3,4,5,...|   -3.63001455187147|
|(10,[0,1,2,3,4,5,...|  2.0550413684170232|
|(10,[0,1,2,3,4,5,...|   2.073020277481627|
|(10,[0,1,2,3,4,5,...| -2.3896166032583945|
|(10,[0,1,2,3,4,5,...| -1.8938677036487737|
|(10,[0,1,2,3,4,5,...|  2.0197967489763364|
|(10,[0,1,2,3,4,5,...| -0.5752888934598176|
|(10,[0,1,2,3,4,5,...|-0.24437408493145857|
|(10,[0,1,2,3,4,5,...| -0.7009257589608522|
|(10,[0,1,2,3,4,5,...|-0.35554614695851183|
|(10,[0,1,2,3,4,5,...| -1.9526201160487047|
|(10,[0,1,2,3,4,5,...| 0.23420118632398573|
|(10,[0,1,2,3,4,5,...| -0.7995666443615203|
|(10,[0,1,2,3,4,5,...|  2.853490