In [1]:
from pyspark.sql import SparkSession


In [2]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [15]:
# Load the customer data set; the first line of the file supplies the column
# names and Spark is asked to infer each column's type from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [12]:
# Quick look at the raw file: read it as plain text lines, split every line on
# commas, and show the first five resulting tokens.
# NOTE: this is a naive split on ',' and does not honour CSV quoting, so it is
# only suitable for eyeballing the file, not for parsing it.
(
    spark.sparkContext
    .textFile('Ecommerce_Customers.csv')
    .flatMap(lambda line: line.split(','))
    .take(5)
)

['Email', 'Address', 'Avatar', 'Avg Session Length', 'Time on App']

In [16]:
# Check the column names and the types that inferSchema assigned on load.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [20]:
# Print every field of the first record, one value per line.
first_row = data.head(1)[0]
for item in first_row:
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [24]:
# List the available column names before choosing the model's input columns.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [25]:
# Combine the four numeric predictor columns into a single vector column
# named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [26]:
# Apply the assembler to the loaded data; the result keeps the original
# columns and gains the assembled 'features' vector column.
output = assembler.transform(data)

# Verify that the new column is present.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [29]:
# Keep only what the regressor needs: the assembled feature vector and the
# label column.
final_data = output.select('features', 'Yearly Amount Spent')

# 70/30 train/test split. The seed is pinned so that re-running the notebook
# against the same input produces the same partition (and therefore comparable
# metrics) rather than a fresh random split on every execution.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [30]:
# Configure the estimator. Only the label column is named here; the features
# column is left at the estimator's default.
# NOTE(review): that default is expected to be 'features', matching the
# assembler's outputCol - confirm against the PySpark docs.
lr = LinearRegression(
    labelCol='Yearly Amount Spent',
)

# Fit on the training portion only; the test portion is held out for evaluation.
lr_model = lr.fit(train_data)



In [31]:
# Score the fitted model against the held-out test portion.
test_results = lr_model.evaluate(test_data)

# Display the per-row residuals (show() prints the first 20 rows by default).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.929444229696287|
| -6.735620966548822|
|-3.8969936455564493|
| 22.879080324138613|
| 18.889127931078804|
| -9.321813466721949|
| 3.2260572471205364|
|-13.847417046518785|
| 6.4505565608334905|
|-1.1423613079506367|
|  2.771636224040151|
| -4.623198177324014|
|-10.716591153214267|
| -3.837050260273827|
| 0.5091409434978686|
|  8.029215893203059|
| 16.726699886530696|
|   5.78074825142437|
|  4.954817757494084|
| -2.351363741864816|
+-------------------+
only showing top 20 rows



In [33]:
# Summary metrics on the test portion, displayed as an (R^2, RMSE) pair.
(
    test_results.r2,
    test_results.rootMeanSquaredError,
)

(0.9797083225909247, 10.658321448772476)