In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('rl').getOrCreate()

In [6]:
from pyspark.ml.regression import LinearRegression

In [7]:
training = spark.read.format('libsvm').load('sample_linear_regression_data.txt', header = True)

In [8]:
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [9]:
lr = LinearRegression (featuresCol='features', labelCol='label', predictionCol='predection')

In [10]:
lrModel = lr.fit(training)

In [11]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [12]:
lrModel.intercept

0.14228558260358093

In [13]:
training_summary = lrModel.summary

In [14]:
all_data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

In [15]:
train_d, test_d = all_data.randomSplit([0.7, 0.3])

In [16]:
train_d.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                344|
|   mean| 0.4492196188413845|
| stddev| 10.052156625218025|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [17]:
model = lr.fit(train_d)

In [18]:
test_results = model.evaluate(test_d)

In [19]:
unlabeled_d = test_d.select('features')

In [20]:
unlabeled_d.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [21]:
predect = model.transform(unlabeled_d)

In [22]:
predect.show()

+--------------------+--------------------+
|            features|          predection|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  1.9670777223982818|
|(10,[0,1,2,3,4,5,...| -0.6610004964488947|
|(10,[0,1,2,3,4,5,...|   3.390014230857974|
|(10,[0,1,2,3,4,5,...| -1.9182802795237226|
|(10,[0,1,2,3,4,5,...|  1.6670486102381286|
|(10,[0,1,2,3,4,5,...|  1.6915875755725533|
|(10,[0,1,2,3,4,5,...|   2.093574912129612|
|(10,[0,1,2,3,4,5,...|  1.7343036419340963|
|(10,[0,1,2,3,4,5,...| -3.3425460011488317|
|(10,[0,1,2,3,4,5,...|  1.0700052741512651|
|(10,[0,1,2,3,4,5,...|  1.9333026825749045|
|(10,[0,1,2,3,4,5,...|  0.7982226800178851|
|(10,[0,1,2,3,4,5,...|  2.4468350962216916|
|(10,[0,1,2,3,4,5,...|-0.40917460989751564|
|(10,[0,1,2,3,4,5,...|  3.5089514545041305|
|(10,[0,1,2,3,4,5,...|  -2.375831653187788|
|(10,[0,1,2,3,4,5,...|  1.1814517016165735|
|(10,[0,1,2,3,4,5,...| 0.28946619980992383|
|(10,[0,1,2,3,4,5,...|   5.363918158671481|
|(10,[0,1,2,3,4,5,...| -2.162656

In [23]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RLR").getOrCreate()

In [24]:
from pyspark.ml.regression import LinearRegression

In [25]:
data = spark.read.csv('EcommerceCustomers.csv', header = True, inferSchema = True)

In [26]:
data.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank Tunnel', Avatar=None, Avg Session Length=None, Time on App=None, Time on Website=None, Length of Membership=None, Yearly Amount Spent=None)]

In [27]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [28]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [29]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [30]:
data = data.na.drop()

In [31]:
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
                           outputCol='features')

In [32]:
output = assembler.transform(data)

In [33]:
output.head(1)

[Row(Email='Wrightmouth', Address=' MI 82180-9605"', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.655651149166752, Time on Website=39.57766801952616, Length of Membership=4.082620632952961, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [34]:
final_data = output.select('features', 'Yearly Amount Spent')

In [35]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352178|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[33.9877728956856...|  570.2004089636195|
|[33.9925727749537...|  492.6060127179966|
|[29.5324289670579...| 408.64035107262754|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...| 470.45273330095546|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...|  407.7045475495441|
|[32.1878120459321...| 452.31567548003545|
|[32.6178560628234...|   605.061038804892|
|[32.9127851111597...|  534.7057438060227|
|[34.5075509985266...|  700.9170916173961|
|[33.0293319535068...| 423.17999168059777|
+----------

In [36]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [37]:
lrr = LinearRegression(labelCol='Yearly Amount Spent')

In [38]:
lrr_model = lrr.fit(train_data)

In [39]:
test_results = lrr_model.evaluate(test_data)

In [40]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-10.788531302129172|
|  7.197065198294922|
|  4.224703056631029|
| -5.783344877192235|
|-12.665330441539027|
| -21.00999407635902|
|   22.2939539101219|
| -4.508065987085331|
|  17.51115824653573|
| -4.362832836984865|
| -6.004429716250797|
| -5.211340473967141|
|-2.6307848237214557|
| 11.951629167369049|
|  5.099542290256238|
| -8.557804237850121|
|  4.810520026571908|
|  17.00484447346605|
| 22.083919031992593|
|-11.745503543653115|
+-------------------+
only showing top 20 rows



In [41]:
test_results.rootMeanSquaredError

10.857710036904377

In [42]:
test_results.r2

0.9819156286837568

In [43]:
unlabe_data = test_data.select('features')

In [44]:
unlabe_data.show()

+--------------------+
|            features|
+--------------------+
|[30.3931845423455...|
|[30.9716756438877...|
|[31.0472221394875...|
|[31.0613251567161...|
|[31.0662181616375...|
|[31.1239743499119...|
|[31.2834474760581...|
|[31.5171218025062...|
|[31.6098395733896...|
|[31.6253601348306...|
|[31.7242025238451...|
|[31.7656188210424...|
|[31.8124825597242...|
|[31.9262720263601...|
|[31.9480174211613...|
|[32.0085045178551...|
|[32.0478009788678...|
|[32.0478146331398...|
|[32.0498393904573...|
|[32.0637746203137...|
+--------------------+
only showing top 20 rows



In [45]:
pred = lrr_model.transform(unlabe_data)

In [46]:
pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...| 330.7174011053228|
|[30.9716756438877...| 487.4415445585978|
|[31.0472221394875...|388.27269613239037|
|[31.0613251567161...| 493.3388029350938|
|[31.0662181616375...| 461.5986236492133|
|[31.1239743499119...| 507.9570479161248|
|[31.2834474760581...| 569.4871355155456|
|[31.5171218025062...|280.42648663747104|
|[31.6098395733896...| 427.0343914045725|
|[31.6253601348306...|380.69973359390906|
|[31.7242025238451...|509.39231700421124|
|[31.7656188210424...| 501.7654221095743|
|[31.8124825597242...| 395.4411298075188|
|[31.9262720263601...|380.25330427695735|
|[31.9480174211613...| 456.8213346026416|
|[32.0085045178551...|451.75502526660557|
|[32.0478009788678...|508.64005115952455|
|[32.0478146331398...| 480.3847132853773|
|[32.0498393904573...| 456.6354378422227|
|[32.0637746203137...|390.07641045045693|
+--------------------+------------