## Statistics & Linear Regression

### In this session you will work with MLlib for Statistics & Linear Regression
1. Read a Parquet file into a DataFrame
2. Convert to an RDD
3. Do some basic Stats - mean, sd
4. Exercise on Correlation
5. Linear Regression mpg ~ .
6. Lasso & Ridge Regression

The coding exercises will give you a good start.

In [None]:
from pyspark.context import SparkContext
# `sc` is not created in this notebook; presumably the PySpark kernel
# provides it -- TODO confirm.
# The parenthesised single-argument form prints the same text under
# Python 2 and is valid syntax under Python 3.
print("Running Spark Version %s" % (sc.version))

In [None]:
from pyspark.conf import SparkConf
# Build a fresh SparkConf and print its debug string.
conf = SparkConf()
# Parenthesised print is valid on both Python 2 and Python 3.
print(conf.toDebugString())

In [None]:
import datetime
from pytz import timezone
# Stamp the notebook with the current time in the US/Pacific zone.
# Parenthesised print is valid on both Python 2 and Python 3.
print("Last run @%s" % (datetime.datetime.now(timezone('US/Pacific'))))

In [None]:
# Load the cars data set from 'cars_1.parquet' into a DataFrame.
# `sqlContext` is not defined in this notebook; presumably the PySpark
# kernel provides it -- TODO confirm.
df_cars = sqlContext.read.load('cars_1.parquet')

In [None]:
# Row count of the DataFrame as loaded (before rows with missing values are dropped).
df_cars.count()

In [None]:
# Display up to 40 rows for a visual check of the data.
df_cars.show(40)

In [None]:
# Remove rows with missing values (na.drop() with its default arguments)
# before the statistics and regression steps.
df_cars = df_cars.na.drop()

In [None]:
# Convert the DataFrame to an RDD whose elements are one-item lists wrapping
# the original row; later cells rely on that shape and index it as x[0][col].
# Going through `.rdd` is required on Spark 2.x, where DataFrame.map() no
# longer exists, and it also works on Spark 1.3+.
cars_rdd = df_cars.rdd.map(lambda row: [row])

In [None]:
# Peek at the first element to confirm the [row] wrapping.
cars_rdd.take(1)

In [None]:
from pyspark.mllib.stat import Statistics
# Column-wise summary statistics over cars_rdd; the following cells read
# min(), mean() and max() from this object.
summary = Statistics.colStats(cars_rdd)

In [None]:
# Show the summary object's string form.
# Parenthesised print is valid on both Python 2 and Python 3.
print(str(summary))

In [None]:
# Per-column minimum values from the summary.
summary.min()

In [None]:
# Print one row each for the per-column min, mean and max, with every value
# formatted as "|%6.2f" and separated by a single space.
# Joining the cells and printing once per row gives the same output as the
# Python 2 trailing-comma print idiom, and also runs on Python 3.
for column_stats in (summary.min(), summary.mean(), summary.max()):
    print(" ".join("|%6.2f" % value for value in column_stats))

### Calculate & Plot Correlations

In [None]:
# Each cars_rdd element is [row], so x[0] is the row and the second index
# picks the column: 2 is used as horsepower and 10 as weight.
hp = cars_rdd.map(lambda x: x[0][2])
weight = cars_rdd.map(lambda x: x[0][10])
# Report both correlation coefficients to three decimals.
# Parenthesised print is valid on both Python 2 and Python 3.
print('%2.3f' % Statistics.corr(hp, weight, method="pearson"))
print('%2.3f' % Statistics.corr(hp, weight, method="spearman"))

In [None]:
# Printing an RDD shows its description, not its contents; use collect() or
# take() to see values.
print(hp)

In [None]:
import pandas as pd
from ggplot import *
%matplotlib inline

# Collect both columns to the driver and pair them up in a pandas DataFrame.
df = pd.DataFrame({
    'HP': hp.collect(),
    'Weight': weight.collect(),
})

# Scatter plot of horsepower against weight. The plot is the cell's final
# expression, so the notebook renders it.
(ggplot(df, aes(x='HP', y='Weight'))
 + geom_point()
 + labs(title="Car-Attributes", x="Horsepower", y="Weight"))

### Coding Exercise
1. Calculate the correlation between Rear Axle Ratio & the Width
2. Plot & verify

In [None]:
# Exercise: replace each <FILL IN> with a map() that selects the column,
# following the hp/weight correlation cell.
ra_ratio = cars_rdd.<FILL IN>
width = cars_rdd.<FILL IN>
# Parenthesised print is valid on both Python 2 and Python 3.
print('%2.3f' % Statistics.corr(ra_ratio, width, method="pearson"))
print('%2.3f' % Statistics.corr(ra_ratio, width, method="spearman"))

In [None]:
# Exercise: build a pandas DataFrame from the collected ra_ratio and width
# values, mirroring the HP/Weight plotting cell.
df = pd.DataFrame(<FILL IN>

# Exercise: scatter-plot the two columns with ggplot.
ggplot(<FILL IN>

# Linear Regression

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.regression import LassoWithSGD
from pyspark.mllib.regression import RidgeRegressionWithSGD
from numpy import array

In [None]:
# Four points lying exactly on y = x.
data = [
   LabeledPoint(0.0, [0.0]),
   LabeledPoint(10.0, [10.0]),
   LabeledPoint(20.0, [20.0]),
   LabeledPoint(30.0, [30.0])
]
# Train with SGD, starting from an initial weight of 1.0.
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
# Parenthesised print is valid on both Python 2 and Python 3.
print(lrm)
print(lrm.weights)
print(lrm.intercept)
# Final expression: the notebook displays the prediction for x = 40.
lrm.predict([40])

In [None]:
# Test points on the same y = x line, placed between the training points.
data_test = [
   LabeledPoint(5.0, [5.0]),
   LabeledPoint(15.0, [15.0]),
   LabeledPoint(25.0, [25.0]),
   LabeledPoint(35.0, [35.0])
]
data_test_rdd = sc.parallelize(data_test)
# Pair each true label with the model's prediction for that point.
valuesAndPreds = data_test_rdd.map(lambda p: (p.label, lrm.predict(p.features)))
# Mean squared error: sum of squared (label - prediction) over the row count.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters are a SyntaxError on Python 3.
MSE = valuesAndPreds.map(lambda pair: (pair[0] - pair[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

In [None]:
# Inspect up to 10 (label, prediction) pairs.
valuesAndPreds.take(10)

### TIP : Step Size is important

In [None]:
# Slightly noisy points; the expected fit is roughly 1.09x - 0.60.
data = [
   LabeledPoint(0.0, [0.0]),
   LabeledPoint(9.0, [10.0]),
   LabeledPoint(22.0, [20.0]),
   LabeledPoint(32.0, [30.0])
]
# First run: no `step` argument, so the default step size (1.0) is used.
# This is expected to diverge on this data.
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0])) # should be 1.09x -0.60
# Parenthesised print is valid on both Python 2 and Python 3.
print("Step Size 1.0 (Default)")
print(lrm)
print(lrm.weights)
print(lrm.intercept)
print("%3.3f" % lrm.predict([40]))
# Second run: same data with an explicit, much smaller step of 0.01,
# chosen to avoid the divergence seen with the default.
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]), step=0.01) # should be 1.09x -0.60
# Blank separator line between the two reports.
print("")
print("Step Size 0.01")
print(lrm)
print(lrm.weights)
print(lrm.intercept)
print("%3.3f" % lrm.predict([40]))

In [None]:
# Four points whose single feature is in the thousands, so a very small
# step size is passed to train().
data = [
   LabeledPoint(18.9, [3910.0]),
   LabeledPoint(17.0, [3860.0]),
   LabeledPoint(20.0, [4200.0]),
   LabeledPoint(16.6, [3660.0])
]
lrm = LinearRegressionWithSGD.train(sc.parallelize(data), step=0.00000001) # should be ~ 0.006582x -7.595170
# Parenthesised print is valid on both Python 2 and Python 3.
print(lrm)
print(lrm.weights)
print(lrm.intercept)
# Final expression: the notebook displays the prediction for x = 4000.
lrm.predict([4000])

### Homework 

1.  Convert the car data to labelled points
2.  Partition to Train & Test (Weight < 4000 and >= 4000)
3.  Train the three Linear Models
4.  Calculate the MSE for the three models

In [None]:
from pyspark.mllib.regression import LabeledPoint
def parse_car_data(x):
    """Turn one cars_rdd element into a LabeledPoint.

    x is indexed as x[0][col]: x[0] is the record and col selects a column.
    Column 0 becomes the label and columns 1 through 11 become the features,
    in order. (Presumably column 0 is mpg, given the "mpg ~ ." goal stated
    at the top of the notebook -- TODO confirm.)
    """
    record = x[0]
    label = record[0]
    # Columns 1..11 inclusive, so feature i corresponds to raw column i + 1.
    features = [record[col] for col in range(1, 12)]
    return LabeledPoint(label, features)

In [None]:
# Exercise: replace <FILL IN> so that every cars_rdd element is converted to
# a LabeledPoint (parse_car_data does this for a single element).
car_rdd_lp = cars_rdd.map(<FILL IN>
# Parenthesised print is valid on both Python 2 and Python 3.
print(car_rdd_lp.count())
print(car_rdd_lp.first().label)
print(car_rdd_lp.first().features)

In [None]:
# Exercise: keep the training rows -- Weight < 4000 per the homework brief.
# Hint: weight is raw column 10 in cars_rdd, so parse_car_data places it at
# features[9].
car_rdd_train = car_rdd_lp.filter(<FILL IN>
car_rdd_train.count()

In [None]:
# Exercise: keep the test rows -- Weight >= 4000 per the homework brief,
# i.e. the complement of the training filter.
car_rdd_test = car_rdd_lp.filter(<FILL IN>
car_rdd_test.count()

In [None]:
# Inspect up to 5 training points.
car_rdd_train.take(5)

In [None]:
# Inspect up to 5 test points.
car_rdd_test.take(5)

In [None]:
# Exercise: train on car_rdd_train. As the step-size tip earlier in the
# notebook shows, the `step` argument matters for convergence.
lrm = LinearRegressionWithSGD.train(<FILL IN>
# Parenthesised print is valid on both Python 2 and Python 3.
print(lrm)
print(lrm.weights)
print(lrm.intercept)

In [None]:
# Pair each test label with the current model's prediction.
valuesAndPreds = car_rdd_test.map(lambda p: (p.label, lrm.predict(p.features)))
# Mean squared error over the test set. The pair is indexed rather than
# unpacked in the lambda signature, because tuple parameters are a
# SyntaxError on Python 3.
MSE = valuesAndPreds.map(lambda pair: (pair[0] - pair[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

In [None]:
# Inspect up to 20 (label, prediction) pairs from the most recently evaluated model.
valuesAndPreds.take(20)

In [None]:
lrm = LassoWithSGD.train(<FILL IN>
print lrm.weights
print lrm.intercept
valuesAndPreds = car_rdd_test.map(lambda p: (p.label, lrm.predict(p.features)))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

In [None]:
# Inspect up to 20 (label, prediction) pairs from the most recently evaluated model.
valuesAndPreds.take(20)

In [None]:
lrm = RidgeRegressionWithSGD.train(<FILL IN>
print lrm.weights
print lrm.intercept
valuesAndPreds = car_rdd_test.map(lambda p: (p.label, lrm.predict(p.features)))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

## That is All Folks !