# Linear Regression with Spark MLlib

In [None]:
from sklearn import datasets
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Load the Boston housing data bundled with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn — confirm the pinned version.
boston = datasets.load_boston()

# One column per feature, with the regression target appended as `price`.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    price=boston.target
)
df.head()

In [None]:
# Convert the pandas frame into a Spark DataFrame.
# NOTE(review): `sqlContext` is not defined in this notebook — presumably it is
# injected by the PySpark kernel/shell; confirm before running elsewhere.
sdf = sqlContext.createDataFrame(df)
# Print the rows with truncation of long cell values disabled.
sdf.show(truncate=False)

In [None]:
# Pack every column except the label (`price`) into a single vector column
# named `features`, which is the column LinearRegression is pointed at below.
assembler = VectorAssembler(inputCols=[c for c in sdf.columns if c != 'price'],
                            outputCol='features')
dataset = assembler.transform(sdf)
# Show the transformed frame rather than the input `sdf`, so the newly added
# `features` column is visible (matches the subset cell further down).
dataset.show(truncate=False)

In [None]:
# Display only the assembled `features` vector column, untruncated.
dataset.select("features").show(truncate=False)

In [None]:
# Cache the assembled DataFrame: it is reused by both `fit` and `evaluate` in
# the cells below.
dataset.cache()

In [None]:
# Configure the estimator: read inputs from `features`, predict `price`.
lr = LinearRegression(featuresCol='features', labelCol='price')
# Fit on the full assembled dataset (all non-label columns as features).
model = lr.fit(dataset)

## Importance of Individual Features

In [None]:
# Fitted weights of the linear model; the plotting cell below pairs them with
# the assembler's input column names.
model.coefficients

In [None]:
# Names of the columns that were packed into the `features` vector.
assembler.getInputCols()

In [None]:
# Bar chart of the fitted weights, labelled by the feature each belongs to.
coef_by_feature = pd.Series(
    np.array(model.coefficients),
    index=assembler.getInputCols(),
)
coef_by_feature.plot(kind='bar')

## Getting Metrics

In [None]:
# Evaluate on the same DataFrame the model was fitted on, so the metrics below
# are in-sample (training) scores, not held-out ones.
summary = model.evaluate(dataset)
# R2 of the full-feature model.
summary.r2

In [None]:
# Explained variance of the full-feature model (same in-sample evaluation).
summary.explainedVariance

## Training on a subset of the data

In [None]:
# Keep three predictors plus the label, then pack the predictors into a
# `features` vector column exactly as was done for the full dataset.
subset = sdf.select("CHAS", "NOX", "RM", "price")
predictor_cols = [name for name in subset.columns if name != 'price']
assembler = VectorAssembler(inputCols=predictor_cols, outputCol='features')
small_dataset = assembler.transform(subset)
small_dataset.show()

In [None]:
# Fit a second model on the reduced feature set.
# Note: `lr` and `summary` are rebound here, so the full-model summary from the
# earlier cell is no longer reachable through `summary` after this cell runs.
lr = LinearRegression(featuresCol='features', labelCol='price')
small_model = lr.fit(small_dataset)
# In-sample evaluation again: same DataFrame used for fitting and scoring.
summary = small_model.evaluate(small_dataset)

In [None]:
# R2 of the reduced (subset) model, if the previous cell has been run.
summary.r2

In [None]:
# Explained variance of the reduced (subset) model.
summary.explainedVariance

## Exercise: Fit a Linear Regression on the Diabetes Dataset
### Step 1: Create a Spark DataFrame, and prepare the dataset
### Step 2: Find the R2 score and explained variance
### Step 3: Find a subset of features with the highest absolute coefficients (by plotting)
### Step 4: Train a new model on this subset, and find R2 and explained variance

In [None]:
# Step 1
# Exercise scaffold: replace the `None` placeholders below, mirroring the
# Boston cells earlier in this notebook (pandas frame -> Spark DataFrame ->
# VectorAssembler -> transformed dataset).
diabetes = datasets.load_diabetes()
# complete code below
df = None
assembler = None
dataset = None

In [None]:
# Step 2 - complete code below
# Until the placeholders are filled in, `summary` is None and the `print`
# below raises AttributeError.
lr = None
model = None
summary = None
print(summary.r2, summary.explainedVariance)

In [None]:
# Step 3:
# enter code here

In [None]:
# Step 4 - complete code below
# Exercise scaffold: choose the feature subset found in Step 3, assemble it,
# and refit. The placeholders are None, so the final `print` raises
# AttributeError until they are replaced.
subset = None
assembler = None
small_dataset = None

lr = None
model = None
summary = None
print(summary.r2, summary.explainedVariance)