In [1]:
from pyspark.sql import SparkSession
# Build the SparkSession for this notebook (application name 'Customers')
# through the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('Customers')
    .getOrCreate()
)

In [25]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the customer CSV into a DataFrame.
# inferSchema=True lets Spark infer each column's type from the data, which
# costs one extra pass over the file (when the option is not set it defaults
# to false). header=True takes the column names from the first line.
dataset = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types).
dataset

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [6]:
# Print the first rows of the loaded data as a text table (long values are truncated).
dataset.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [7]:
# Print the schema tree: each column's name, type and nullability.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [8]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [10]:
# Assembler that packs the four numeric predictor columns into one vector
# column named 'Independent Feature'.
featureassembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='Independent Feature',
)

In [13]:
# Run the assembler over the dataset; the result keeps the original columns
# and gains the 'Independent Feature' vector column.
output = featureassembler.transform(dataset)

In [14]:
# Print the first rows of the assembled DataFrame, including the new vector column.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| Independent Feature|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [16]:
# Look at the assembled feature vectors on their own.
output.select('Independent Feature').show()

+--------------------+
| Independent Feature|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [17]:
# List the column names of the assembled DataFrame.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Feature']

In [18]:
# 'Independent Feature' is the single vector column that combines all of the
# independent (predictor) columns; 'Yearly Amount Spent' is the dependent
# variable (Y) we want to predict.
# Keep only these two columns: the features vector and the label are all the
# regression needs.
final_data = output.select('Independent Feature','Yearly Amount Spent')

In [20]:
# Preview the first 5 rows of the features/label DataFrame.
final_data.show(5)

+--------------------+-------------------+
| Independent Feature|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
+--------------------+-------------------+
only showing top 5 rows



In [23]:
# Randomly split the rows into roughly 75% training and 25% test data.
# The seed is fixed so that re-running the notebook on the same data yields
# the same split, and therefore the same fitted model and predictions;
# without it every run produces different results.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

In [26]:
# Configure a linear regression that reads the assembled feature vector and
# the yearly-spend label, then fit it on the training split.
# `regressor` ends up bound to the fitted model (not the unfitted estimator).
regressor = LinearRegression(
    featuresCol='Independent Feature',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [27]:
# Coefficients of the fitted model: one weight per assembled input feature.
regressor.coefficients

DenseVector([25.953, 38.3924, 0.7528, 61.4128])

In [28]:
# Intercept (constant term) of the fitted model.
regressor.intercept

-1066.1902807254303

In [32]:
# Evaluate the fitted model on the held-out test split.
# Note: `pred` holds the evaluation summary object, not the predictions
# themselves; the per-row predictions are in its `.predictions` DataFrame.
pred = regressor.evaluate(test_data)

In [34]:
# Show the test rows with the model's `prediction` column next to the actual
# 'Yearly Amount Spent' label.
pred.predictions.show()

+--------------------+-------------------+------------------+
| Independent Feature|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.39318454,11.8...|        319.9288698| 331.0629428769414|
|[30.4925367,11.56...|        282.4712457| 287.1873393471144|
|[30.81620065,11.8...|        266.0863409|282.99339860163695|
|[30.83643267,13.1...|        467.5019004| 470.5322822891387|
|[31.06621816,11.7...|        448.9332932| 461.2910866721188|
|[31.28344748,12.7...|        591.7810894|  568.434904686809|
|[31.30919264,11.9...|        432.7207178| 429.1655979316208|
|[31.36621217,11.1...|        430.5888826|426.69644863573035|
|[31.44744649,10.1...|        418.6027421| 426.7187990506013|
|[31.60983957,12.7...|        444.5455497|426.79281675874654|
|[31.62536013,13.1...|        376.3369008|380.57476181789593|
|[31.81248256,10.8...|         392.810345| 395.8263149288555|
|[31.85125313,12.4...|        472.9922467|464.02423779276796|
|[31.862