# Data sources for MLlib

In [None]:
# Reading CSVs

In [None]:
# Load the iris CSV into a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the data.
df = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True)
    .load("../gramex/testlib/iris.csv")
)

In [None]:
# Preview the loaded rows without truncating long cell values.
df.show(truncate=False)

In [None]:
# From Pandas to Spark DataFrames

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins an older release, or swap the dataset.
from sklearn.datasets import load_boston
import pandas as pd
boston = load_boston()

# Build a pandas frame from the feature matrix and append the target as
# 'price'. This rebinds `df`, which until now held the Spark DataFrame read
# from the iris CSV.
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['price'] = boston.target

In [None]:
# Convert the pandas frame into a Spark DataFrame. Uses the same `spark`
# session as the CSV reads in this notebook instead of the legacy `sqlContext`
# entry point, so only one session object needs to exist.
sdf = spark.createDataFrame(df)
sdf.show(truncate=False)

# Linear Regression with MLlib

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Fit with default settings directly on `sdf`, which holds only the raw
# dataset columns plus 'price' — no assembled feature vector column.
# NOTE(review): this call presumably fails on purpose; the next section of the
# notebook asks what went wrong and introduces VectorAssembler.
lr = LinearRegression()
lr.fit(sdf)

## What went wrong?

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Pack every column except the 'price' target into a single 'features'
# vector column, then apply that to the Spark DataFrame.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[name for name in sdf.columns if name != 'price'],
)
dataset = assembler.transform(sdf)

In [None]:
# Preview the assembled dataset, including the new 'features' column.
dataset.show()

In [None]:
# Train a linear regression on the assembled 'features' vectors, using
# 'price' as the target column.
lr = LinearRegression(
    labelCol='price',
    featuresCol='features',
)
model = lr.fit(dataset)

In [None]:
# Evaluate the model on the same data it was fitted on and print its R².
summary = model.evaluate(dataset)
print(summary.r2)

## Other regression methods

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Train a decision-tree regressor on the same 'features' / 'price' columns.
# Note that this rebinds `model`, replacing the linear regression model.
dtr = DecisionTreeRegressor(
    labelCol='price',
    featuresCol='features',
)
model = dtr.fit(dataset)

In [None]:
# Same evaluate()/r2 calls that worked for the linear regression model.
# NOTE(review): the decision-tree model presumably has no evaluate() method,
# so this cell is expected to raise — it appears to be the deliberate lead-in
# to the following "different interfaces" section. Confirm against the
# PySpark version in use.
summary = model.evaluate(dataset)
print(summary.r2)

### Different algorithms may have different interfaces!

In [None]:
# Run the fitted model over the dataset and show its 'prediction' column.
output = model.transform(dataset)
output.select('prediction').show()

In [None]:
# Show the true 'price' values for comparison with the predictions.
output.select('price').show()

## How do we evaluate this?

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the transformed output with the 'r2' metric, comparing the
# 'prediction' column against the 'price' labels.
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='r2')
# Bare expression so the notebook displays the resulting score.
evaluator.evaluate(output)

## Exercise: Use `RandomForestRegressor` (imported below) and report the R² score

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
# enter code here

# Classification with MLlib

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Reload the iris CSV (header row supplies column names, types are inferred);
# `df` was rebound to a pandas frame earlier in the notebook.
df = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True)
    .load("../gramex/testlib/iris.csv")
)
# Print the column names and their inferred types.
df.printSchema()

In [None]:
# Assemble every column except the 'species' label into a 'features' vector.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[name for name in df.columns if name != 'species'],
)

In [None]:
# Apply the assembler to the iris rows to add the 'features' column.
iris = assembler.transform(df)

In [None]:
# Fit a logistic regression using the raw 'species' column as the label.
# NOTE(review): presumably fails on purpose because 'species' is not numeric —
# the next section asks what went wrong and introduces StringIndexer.
lr = LogisticRegression(featuresCol='features', labelCol='species')
model = lr.fit(iris)

## What went wrong?

In [None]:
from pyspark.ml.feature import StringIndexer
# Derive a 'speciesIndex' column from the 'species' column.
indexer = StringIndexer(inputCol='species', outputCol='speciesIndex')

# Fit the indexer on the raw iris frame and apply it to the same frame.
# This rebinds `iris`, so the earlier assembled version is discarded.
iris = indexer.fit(df).transform(df)

# Show a sample of about 10% of the rows; no seed is given, so the rows
# shown can differ between runs.
iris.sample(fraction=0.1).show()

In [None]:
# Assemble the measurement columns into 'features', skipping any column whose
# name starts with 'species'. Column names come from the raw `df`; the
# transform is applied to the indexed `iris` frame.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[name for name in df.columns if not name.startswith('species')],
)
dataset = assembler.transform(iris)

In [None]:
# Inspect the columns of the assembled, indexed dataset.
dataset.printSchema()

In [None]:
# Fit a logistic regression against the indexed label column 'speciesIndex'
# instead of the raw 'species' column.
lr = LogisticRegression(
    labelCol='speciesIndex',
    featuresCol='features',
)
model = lr.fit(dataset)

In [None]:
# Evaluate on the same rows the model was fitted on.
summary = model.evaluate(dataset)

In [None]:
# Bare expression so the notebook displays the accuracy.
summary.accuracy

## That's too convenient: the model was evaluated on its own training data

In [None]:
# Split the rows roughly 70/30 into training and held-out test sets. The fixed
# seed makes the split — and therefore the accuracy measured on the test set —
# repeatable for the same input data instead of varying on every run.
trainData, testData = dataset.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Refit on the training split only, then evaluate on the held-out test split.
model = lr.fit(trainData)
summary = model.evaluate(testData)

In [None]:
# Bare expression so the notebook displays the held-out accuracy.
summary.accuracy

## Exercise: Use the NaiveBayes classifier on the Iris dataset
### Check the accuracy for a 60-40 train / test split.
### Hint: Use the `MulticlassClassificationEvaluator`

In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# enter code here