# Linear Regression

In [None]:
import pyspark
import sys

In [None]:
import pyspark.sql.functions as fn

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [None]:
# Display the name of the application attached to the active Spark context.
# NOTE(review): `spark` is not created in the visible cells — presumably the
# notebook kernel provides the session; confirm.
spark.sparkContext.appName

In [None]:
# Turn on the `spark.sql.execution.arrow.pyspark.enabled` session setting.
# NOTE(review): per the Spark docs this governs Arrow-based transfers for
# pandas conversions; no visible cell converts to pandas — confirm it is needed.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [None]:
# Print runtime versions.
# Version string of the Python interpreter running this notebook
sys.version

In [None]:
# Version of the Spark runtime behind the active session
spark.version

### Exploring Data

In [None]:
# Load the regression dataset from CSV into a Spark DataFrame, using the
# first row as column names and inferring column types from the values.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/lr_dataset.csv')
)

In [None]:
# Validate the size of the data as a (row count, column count) pair
df.count(), len(df.columns)

In [None]:
# Preview the first 5 rows of the loaded dataset
df.show(n=5)

In [None]:
# Print the schema of the loaded DataFrame
df.printSchema()

In [None]:
# Show summary statistics for the loaded columns
df.describe().show()

In [None]:
# Correlation between the 'var_1' column and the 'output' label
df.select(
    fn.corr(col1='var_1', col2='output')
).show()

### Feature Engineering

In [None]:
# Display all column names of the loaded DataFrame
df.columns

In [None]:
# Assemble every column except the label into a single 'features' vector
# column. The label is excluded by name rather than by position, so 'output'
# cannot leak into the features (and no real feature is dropped) if the label
# is ever not the last column of the file.
# NOTE(review): assumes all remaining columns are of a type VectorAssembler
# accepts (numeric/boolean/vector) — confirm against the dataset schema.
feature_cols = [name for name in df.columns if name != 'output']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
features_df = assembler.transform(df)

In [None]:
# Validate that the schema now includes the assembled 'features' column
features_df.printSchema()

In [None]:
# Inspect the first 5 assembled feature vectors without truncating the output
features_df.select('features').show(n=5, truncate=False)

In [None]:
# Keep just the assembled feature vector and the label column for modelling
model_df = features_df.select('features', 'output')

In [None]:
# Data ready for machine learning: show the first 10 rows, untruncated
model_df.show(n=10, truncate=False)

In [None]:
# Size of the modelling DataFrame as a (row count, column count) pair
model_df.count(), len(model_df.columns)

### Split Data - Train & Test sets

In [None]:
# Randomly split the modelling data, weighted 70/30, into training and test
# sets; the seed is fixed so the sampling is repeatable.
train_df, test_df = model_df.randomSplit(weights=[0.70, 0.30], seed=42)

In [None]:
# Size of the training set as a (row count, column count) pair
train_df.count(), len(train_df.columns)

In [None]:
# Size of the test set as a (row count, column count) pair
test_df.count(), len(test_df.columns)

### Build Linear Regression Model 

In [None]:
# Configure a linear regression estimator that learns the 'output' label
# from the assembled 'features' vector column
lin_Reg = LinearRegression(featuresCol='features', labelCol='output')

In [None]:
# Fit the linear regression estimator on the training set
lr_model=lin_Reg.fit(train_df)

In [None]:
# Intercept term of the fitted model
lr_model.intercept

In [None]:
# Coefficients of the fitted model
lr_model.coefficients

In [None]:
# Evaluate the fitted model on the training data.
# NOTE(review): despite its name, this variable is read by the following cells
# for summary metrics (meanSquaredError, r2), not for row-level predictions.
training_predictions=lr_model.evaluate(train_df)

In [None]:
# Mean squared error on the training data
training_predictions.meanSquaredError

In [None]:
# Coefficient of determination (r2) on the training data
training_predictions.r2

### Evaluate Model

In [None]:
# Evaluate the fitted model on the held-out test data
test_results=lr_model.evaluate(test_df)

In [None]:
# Show the first 10 residuals from the test-set evaluation, untruncated
test_results.residuals.show(n=10, truncate=False)

In [None]:
# Coefficient of determination (r2) of the model on the test data
test_results.r2

In [None]:
# Root mean squared error (RMSE) on the test data
test_results.rootMeanSquaredError

In [None]:
# Mean squared error (MSE) on the test data
test_results.meanSquaredError