## 1. Import libraries and get a Spark session

In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session registered under the app name 'LR'.
spark = (
    SparkSession.builder
    .appName('LR')
    .getOrCreate()
)

## 2. Get data

In [15]:
# Load the customer CSV; the first line supplies the column names and Spark
# infers each column's type from the data instead of reading everything as
# strings. (Call df.show() here for a quick look at the raw rows.)
df = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [16]:
# Confirm what inferSchema produced: column names and their Spark types.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [17]:
# Spark DataFrames expose no .shape attribute, so derive it by hand:
# count() gives the number of rows, len(columns) the number of columns.
n_rows = df.count()
n_cols = len(df.columns)
print(f'shape: rows={n_rows}, cols={n_cols}')

shape: rows=500, cols=8


In [19]:
# Column names as a plain Python list (handy when choosing feature columns).
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

### Show some data

In [18]:
# Peek at the first three records; each Row is printed after a blank line so
# the long reprs stay visually separated.
first_rows = df.head(3)
for record in first_rows:
    print('\n', record)


 Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

 Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264)

 Row(Email='pallen@yahoo.com', Address='24645 Valerie Unions Suite 582Cobbborough, DC 99414-7564', Avatar='Bisque', Avg Session Length=33.000914755642675, Time on App=11.330278057777512, Time on Website=37.110597442120856, Length of Membership=4.104543202376424, Yearly Amount Spent=487.54750486747207)


## 3. Make features column for MLlib

In [20]:
# VectorAssembler packs several input columns into one vector column.
# Here the four numeric customer metrics become a single 'features' column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)
output = assembler.transform(df)

In [21]:
output.printSchema()  # same columns as df plus the new "features" vector column

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [23]:
# Inspect a single row to see the assembled feature vector in full.
output.head(1)
# Note: 'features' was built from 4 columns, so the vector holds 4 values;
# it is simply those 4 column values packed together.

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

## 4. Specify features and label, then split into training and test sets

In [24]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [25]:
# Preview the two-column modelling frame (show() truncates long cell values).
final_data.show(5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



#### training, test split

In [26]:
# 80% of the rows for training, 20% for testing. The weights are target
# proportions, so the resulting row counts are approximate rather than exact.
# A fixed seed keeps the split repeatable across kernel restarts; without one
# every run draws a different partition, and the counts and metrics reported
# later in the notebook cannot be reproduced. Outputs recorded from an
# earlier unseeded run may therefore differ slightly after re-execution.
training_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [28]:
# Summary statistics of the training split. Only the label column appears:
# 'features' is a vector column, which describe() does not summarise.
training_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                395|
|   mean| 499.37152426781125|
| stddev|   81.1713888402499|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [29]:
# Same summary for the test split; its mean and stddev should be close to the
# training split's if the random split is reasonably balanced.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                105|
|   mean|  499.0977813667611|
| stddev|  72.26655089834281|
|    min| 352.55010816300035|
|    max|  689.7876041747194|
+-------+-------------------+



## 5. Linear Regression model

In [30]:
# Configure the estimator: read inputs from 'features', learn the
# 'Yearly Amount Spent' label, and write fitted values to 'prediction'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)

#### Model

In [32]:
%%time
# Fit on the training split only. %%time must remain the first line of the
# cell; it reports how long the fit took.
lr_model = lr.fit(dataset=training_data)

CPU times: user 4.72 ms, sys: 7.68 ms, total: 12.4 ms
Wall time: 981 ms


#### Evaluation

In [33]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and R2 inspected in the following cells.
test_results = lr_model.evaluate(dataset=test_data)

In [36]:
test_results.residuals.show(5)  # residual = y - y_hat (actual label minus prediction)

+------------------+
|         residuals|
+------------------+
|10.414324417496346|
|3.7087982374620196|
|-3.299961487966357|
|-8.004029656211856|
| 4.928775095236972|
+------------------+
only showing top 5 rows



In [38]:
test_results.rootMeanSquaredError  # in label units; judge it against the label's mean/stddev from describe()

9.337785825356201

In [40]:
final_data.describe().show()  # mean=499.3, stddev=79.3 --> an RMSE of 9.33 is small relative to the label's spread

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [39]:
# R2 on the test split: the share of the label's variance the model explains.
test_results.r2

0.9831434333496506

## 6. Deploy with unlabeled data

In [41]:
# Quick stand-in for new, unlabeled data: keep only the feature vectors of the
# test split, leaving the label column out.
unlabeled_data = test_data.select('features')
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[31.3662121671876...|
|[31.4252268808548...|
|[31.4474464941278...|
|[31.5316044825729...|
+--------------------+
only showing top 5 rows



#### prediction

In [43]:
# transform() appends a 'prediction' column (the predictionCol configured on
# the estimator) next to the input features.
predictions = lr_model.transform(dataset=unlabeled_data)
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|398.22602665513114|
|[31.3662121671876...| 426.8800843190229|
|[31.4252268808548...| 534.0666801427283|
|[31.4474464941278...| 426.6067717514359|
|[31.5316044825729...| 431.5868306341256|
+--------------------+------------------+
only showing top 5 rows



## 7. Tip: convert a categorical column to numerical values

In [44]:
# The code below is pseudocode
from pyspark.ml.feature import StringIndexer

In [None]:
# Template only: `df_old` and both column names are placeholders. Replace them
# with a real DataFrame and real column names before running this cell.
indexer = StringIndexer(inputCol='Name of Categorical column',  # categorical column of df_old to encode
                        outputCol='Name of new numerical column')  # name of the numeric column to add
# fit() learns the category-to-index mapping; transform() adds the indexed column.
df_new = indexer.fit(df_old).transform(df_old)