# 36. Linear Regression Documentation

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session used by every cell below, named 'lr'.
spark = (
    SparkSession.builder
    .appName('lr')
    .getOrCreate()
)

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Load the sample regression data set, stored in libsvm format.
# The estimator configured below reads its 'label' and 'features' columns.
training = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_linear_regression_data.txt')
)

In [0]:
# Print a preview of the loaded rows to check the data was read as expected.
training.show()

In [0]:
# Configure the linear regression estimator, naming explicitly the input
# feature column, the target column, and the column predictions are written to.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [0]:
# Fit the estimator on the full data set; the result is the trained model.
lrModel = lr.fit(training)

In [0]:
# Inspect the fitted coefficients (one weight per feature).
lrModel.coefficients

In [0]:
# Inspect the fitted intercept term.
lrModel.intercept

In [0]:
# Grab the summary object holding metrics computed on the training data.
training_summary = lrModel.summary

In [0]:
# R-squared of the fit on the training data.
training_summary.r2

In [0]:
# Root mean squared error of the fit on the training data.
training_summary.rootMeanSquaredError

In [0]:
# Read the same libsvm sample file again, this time to demonstrate a
# train/test split instead of fitting on every row.
data_all = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_linear_regression_data.txt')
)

In [0]:
# Randomly split the rows using weights 0.7 and 0.3; the result is a
# collection of DataFrames, one per weight.
data_split = data_all.randomSplit([0.7,0.3])

In [0]:
# Display the split result to see what randomSplit returned.
data_split

In [0]:
# Split into roughly 70% training and 30% test rows. A fixed seed makes the
# split repeatable, so the evaluation metrics computed from it do not change
# from one run of the notebook to the next.
train_data, test_data = data_all.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Print summary statistics of the training portion.
train_data.describe().show()

In [0]:
# Fit a new model using only the training portion of the data.
train_model = lr.fit(train_data)

In [0]:
# Evaluate the model on the held-out test rows; returns a summary of metrics.
test_results = train_model.evaluate(test_data)

In [0]:
# Root mean squared error on the test rows.
test_results.rootMeanSquaredError

In [0]:
# Keep only the feature column to mimic new data that has no label.
data_unlabeled = test_data.select('features')

In [0]:
# Preview the unlabeled rows.
data_unlabeled.show()

In [0]:
# Apply the trained model to the unlabeled rows to produce predictions.
predictions = train_model.transform(data_unlabeled)

In [0]:
# Preview the features alongside the model's predictions.
predictions.show()

# 38. Linear Regression Example Code Along

In [0]:
# Obtain the Spark session for the e-commerce example.
# NOTE(review): a session is already created earlier in this file, so
# getOrCreate() presumably returns that existing session and the new app
# name may not take effect - confirm if the name matters.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [0]:
# Load the e-commerce customers CSV. The first line supplies the column
# names and Spark is asked to infer each column's type.
data = spark.read.csv(
    '/FileStore/tables/Ecommerce_Customers.csv',
    inferSchema=True,
    header=True,
)

In [0]:
# Print column names and types to verify the schema.
data.printSchema()

In [0]:
# Print every field of the first row on its own line to eyeball sample values.
first_row = data.head(1)[0]
for item in first_row:
    print(item)

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the available column names before choosing the model's input features.
data.columns

In [0]:
# Combine the four numeric input columns into a single vector column
# named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [0]:
# Apply the assembler to the loaded data, producing the 'features' column.
output = assembler.transform(data)

In [0]:
# Print the schema of the assembled DataFrame to check the new column.
output.printSchema()

In [0]:
# Look at the first assembled row.
output.head(1)

In [0]:
# Keep only the feature vector and the target column used for modelling.
data_final = output.select('features','Yearly Amount Spent')

In [0]:
# Split into roughly 70% training and 30% test rows. A fixed seed makes the
# split repeatable, so the evaluation metrics computed from it do not change
# from one run of the notebook to the next.
train_data, test_data = data_final.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Print summary statistics of the training portion.
train_data.describe().show()

In [0]:
# Print summary statistics of the test portion for comparison.
test_data.describe().show()

In [0]:
# Configure the estimator with 'Yearly Amount Spent' as the target column.
# The features column is not named here; presumably the default matches the
# 'features' column selected into data_final - confirm against the API docs.
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [0]:
# Fit the model on the training portion.
lr_model = lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the held-out test rows.
test_results = lr_model.evaluate(test_data)

In [0]:
# Preview the per-row residuals on the test rows.
test_results.residuals.show()

In [0]:
# Root mean squared error on the test rows.
test_results.rootMeanSquaredError

In [0]:
# R-squared on the test rows.
test_results.r2

In [0]:
# Print summary statistics of the modelling data; the scale of
# 'Yearly Amount Spent' helps put the error metrics above in context.
data_final.describe().show()

In [0]:
# Keep only the feature column to mimic new data that has no label.
data_unlabeled = test_data.select('features')

In [0]:
# Apply the trained model to the unlabeled rows to produce predictions.
predictions = lr_model.transform(data_unlabeled)

In [0]:
# Preview the features alongside the model's predictions.
predictions.show()