# Lab 13: Selecting a model with Cross Validation

As always, we create a SparkContext/HiveContext.

In [None]:
# Spark bootstrap for this lab: configure, create the SparkContext, then wrap
# it in a HiveContext pointed at the "demo" database.
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

# Executor memory is set as a system property before the context is created.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

# SparkConf.set() returns the conf object, so the settings can be chained.
conf = (
    SparkConf()
    .set('spark.executor.instances', 15)
    .set('spark.sql.autoBroadcastJoinThreshold', 100*1024*1024)  # 100MB for broadcast join
)
sc = SparkContext(master='yarn-client', appName='Spark-lab13', conf=conf)

# SQL entry point backed by Hive; switch to the "demo" database.
hc = HiveContext(sc)
hc.sql("use demo")

In [None]:
def eval_metrics(lap):
    """Compute precision, recall and accuracy for binary predictions.

    Args:
        lap: a pandas-style dataframe with a 'label' column and a
            'prediction' column, both holding 0/1 values (one row per
            scored example).

    Returns:
        A dict with the keys 'precision', 'recall' and 'accuracy', each a
        float. A ratio whose denominator is zero (no predicted positives,
        no actual positives, or an empty frame) is reported as 0.0 instead
        of raising ZeroDivisionError.
    """
    label = lap['label']
    prediction = lap['prediction']

    # Count each confusion-matrix cell from a boolean mask; summing the mask
    # avoids materialising four filtered copies of the frame.
    tp = float(((label == 1) & (prediction == 1)).sum())
    tn = float(((label == 0) & (prediction == 0)).sum())
    fp = float(((label == 0) & (prediction == 1)).sum())
    fn = float(((label == 1) & (prediction == 0)).sum())

    def _ratio(numerator, denominator):
        # Undefined ratios (0/0) are reported as 0.0 rather than crashing.
        return numerator / denominator if denominator else 0.0

    return {
        'precision': _ratio(tp, tp + fp),
        'recall': _ratio(tp, tp + fn),
        'accuracy': _ratio(tp + tn, tp + tn + fp + fn),
    }

As before, prepare the training and testing datasets:
1. Load the feature matrix created in lab 10 into a Spark dataframe called 'fm'
2. Split into two dataframes - train (2011-2013) and test (only 2014)

In [None]:
<YOUR CODE HERE>

As in lab 12, build instances of the StringIndexer() and OneHotEncoder() for each of the variables, then combine them with a VectorAssembler:

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

# Build pre-process pipeline
# One StringIndexer and one OneHotEncoder instance per variable, then a
# VectorAssembler that combines them (same approach as lab 12).
<YOUR CODE HERE>

Now build the complete feature generation and cross validation pipeline:
* Build a CrossValidator instance using LogisticRegression and a parameter grid. In your parameter grid, test the values [0.1, 0.5, 1.0, 5.0] for the regularization parameter (regParam), and the values [0.0, 0.5, 1.0] for the elasticNetParam.
* Build a pipeline with all the stages plus the cross validator

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Build a parameter grid
# Logistic regression instance intended for the CrossValidator; maxIter=100
# is passed as its maximum iteration setting.
cvlr = LogisticRegression(maxIter=100)
# Two addGrid() calls: one for the regularization parameter values
# [0.1, 0.5, 1.0, 5.0] and one for the elasticNetParam values [0.0, 0.5, 1.0].
grid = (ParamGridBuilder() 
      .addGrid(<YOUR CODE HERE>) 
      .addGrid(<YOUR CODE HERE>)
      .build())
# Evaluator configured with the "label" column and the areaUnderROC metric.
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(<YOUR CODE HERE>)

# Full pipeline: all the pre-processing stages plus the cross validator.
pipe = <YOUR CODE HERE>

Run the end-to-end pipeline:
* Store the output of fit() in a variable called "model"
* Apply the fitted model to the test data
* Compute and print the resulting metrics.

In [None]:
# Fit the pipeline and keep the fitted result in "model".
model = pipe.<YOUR CODE HERE>
# Apply the fitted model to the test data.
results = model.<YOUR CODE HERE>

# Score the test-set output with the evaluator built alongside the grid.
print "best AUC-ROC = " + str(evaluator.evaluate(results))
# NOTE(review): eval_metrics indexes its argument with boolean masks and len(),
# so it presumably expects a pandas dataframe rather than a Spark one -- confirm.
print eval_metrics(<YOUR CODE HERE>)

The "model" variable is of type PipelineModel and contains the fitted version of each stage in the pipeline. We can look at the final stage (our cross-validation model) and print the intercept and weights for the best model:

In [None]:
# The cross-validation model is the final stage of the fitted pipeline;
# extract its best model here.
bestModel = model.<YOUR CODE HERE>
print bestModel.intercept
# NOTE(review): newer Spark releases expose the weights as `coefficients` --
# confirm against the Spark version on the cluster.
print bestModel.weights