In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('Linear_regression_sample')
    .getOrCreate()
)

# Load the customer CSV: the first line supplies the column names and the
# column types are inferred from the data.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    inferSchema=True,
    header=True,
)

# Preview the first three rows.
data.show(n=3)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|   Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
+--------------------+--------------------+---------+------------------+----------------

In [3]:
# Print the schema of the loaded DataFrame as a tree: one line per column
# with its name, type and nullability.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [10]:
# Fetch the first two rows and walk the second one (index 1), printing each
# field on its own line so long values are shown untruncated.
for item in data.head(2)[1]:
    print(item)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# List the column names; as the cell's last expression the list is rendered
# as the cell output.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [13]:
# Combine the four numeric predictor columns into a single vector column
# named 'features'; the remaining columns are carried through unchanged.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)
output = assembler.transform(data)

# Inspect the first assembled row (returned as a one-element list of Row).
output.take(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [14]:
# Print the schema after assembly to confirm that the 'features' vector
# column now sits alongside the original columns.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Keep only what the regression uses: the assembled feature vector and the
# label column.
final_data = output.select('features', 'Yearly Amount Spent')

# Preview the first three rows of the modelling frame.
final_data.show(n=3)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
+--------------------+-------------------+
only showing top 3 rows



In [16]:
# Print the schema of the two-column modelling frame (feature vector + label).
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [18]:
# Hold out roughly 30% of the rows for evaluation. randomSplit assigns rows
# to the splits at random according to the weights, so the resulting sizes
# are approximate rather than exact.
# A fixed seed makes the split repeatable across kernel restarts (assuming the
# same input file and partitioning), so the fitted model and the reported
# metrics can be reproduced. Without it every run yields a different split.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of the label in the training split.
train.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                344|
|   mean| 501.56891944874263|
| stddev|  80.05283936790765|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [19]:
# Summary statistics of the label in the held-out test split, for comparison
# with the training split's statistics.
test.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                156|
|   mean|  494.3417361469743|
| stddev|  77.68504211687474|
|    min| 256.67058229005585|
|    max|  700.9170916173961|
+-------+-------------------+



In [20]:
# Configure the estimator: read predictors from the 'features' vector column
# (the estimator's default, spelled out here) and use yearly spend as the label.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

# Fit on the training split only; the test split stays unseen until evaluation.
lr_model = lr.fit(train)

In [22]:
# Score the fitted model against the held-out split; the returned summary
# exposes the residuals and the error metrics used in the following cells.
test_result = lr_model.evaluate(test)

# Show the first 20 residuals, one per test row.
test_result.residuals.show(n=20)

+-------------------+
|          residuals|
+-------------------+
|  9.920816513034879|
|  10.70091171045658|
| -4.052157537914752|
|  4.197582213270721|
|-13.117617411562094|
| -7.510566592645546|
|  22.01350655518786|
|  18.56111122682347|
| 3.5394008658557823|
|0.23861684488724677|
| 2.8285428337596272|
| -5.705837609900527|
| -4.489310758540171|
| -5.753400234926289|
|  3.839466127812159|
|-2.1099929840607956|
|  -4.16714747346856|
| -26.51752993610296|
| 2.0634824112656247|
|-4.8095343970560975|
+-------------------+
only showing top 20 rows



In [23]:
# Root mean squared error on the held-out test split, expressed in the same
# units as the label ('Yearly Amount Spent').
test_result.rootMeanSquaredError

9.935028196912466

In [24]:
# Coefficient of determination (R^2) on the held-out test split.
test_result.r2

0.983538996881317

In [25]:
# Summary statistics of the label over the full dataset, giving the scale
# (mean, stddev, range) against which the test RMSE can be judged.
# Only the label column appears in the summary; the vector column is not
# summarised.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [27]:
# Drop the label from the test split to mimic new, unlabeled records that
# carry only the feature vector.
unlabeled_data = test.select(['features'])

# Preview the first 20 feature vectors.
unlabeled_data.show(n=20)

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.7377203726281...|
|[30.8364326747734...|
|[31.0472221394875...|
|[31.0662181616375...|
|[31.1280900496166...|
|[31.2834474760581...|
|[31.3123495994443...|
|[31.3662121671876...|
|[31.3895854806643...|
|[31.4459724827577...|
|[31.5147378578019...|
|[31.5171218025062...|
|[31.5257524169682...|
|[31.5316044825729...|
|[31.5761319713222...|
|[31.6253601348306...|
|[31.6739155032749...|
|[31.7366356860502...|
|[31.7656188210424...|
+--------------------+
only showing top 20 rows



In [29]:
# Apply the fitted model to the unlabeled feature vectors; the result keeps
# 'features' and adds a 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

# Show the first 20 predictions.
predictions.show(n=20)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 398.7195345595926|
|[30.7377203726281...| 451.0798304857733|
|[30.8364326747734...|471.55405796490436|
|[31.0472221394875...| 388.2998169757507|
|[31.0662181616375...|462.05091061923645|
|[31.1280900496166...| 564.7632533397002|
|[31.2834474760581...| 569.7675828704796|
|[31.3123495994443...|445.03030680111715|
|[31.3662121671876...|427.04948169062914|
|[31.3895854806643...|409.83099421509564|
|[31.4459724827577...|482.04842210136894|
|[31.5147378578019...|495.51832560636194|
|[31.5171218025062...| 280.4077314089259|
|[31.5257524169682...| 449.7190270448082|
|[31.5316044825729...| 432.6761396015504|
|[31.5761319713222...| 543.3365769733891|
|[31.6253601348306...|380.50404823039275|
|[31.6739155032749...|502.24259784598416|
|[31.7366356860502...| 494.8699638442663|
|[31.7656188210424...|501.36361603266323|
+--------------------+------------