# Lab 13: Selecting a model with Cross Validation

As always, we create a SparkContext/HiveContext.

In [1]:
# Configure and start the Spark context, then attach a Hive context to it
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import *
from pyspark.sql import HiveContext

# Executor memory is passed as a system property; per the PySpark docs this
# has to happen before the SparkContext is instantiated.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

broadcast_threshold = 100*1024*1024  # 100MB for broadcast join
conf = (SparkConf()
        .set('spark.executor.instances', 15)
        .set('spark.sql.autoBroadcastJoinThreshold', broadcast_threshold))
sc = SparkContext(master='yarn-client', appName='Spark-lab13', conf=conf)

# All subsequent SQL/DataFrame work goes through the Hive context, using the "demo" database
hc = HiveContext(sc)
hc.sql("use demo")

DataFrame[result: string]

In [2]:
def eval_metrics(lap):
    """Compute precision, recall and accuracy for a binary classifier.

    Parameters
    ----------
    lap : pandas.DataFrame
        Must expose a 'label' column (ground truth) and a 'prediction'
        column. Only rows where both values are 0 or 1 are counted; any
        other value is ignored in every cell of the confusion matrix.

    Returns
    -------
    dict
        {'precision': float, 'recall': float, 'accuracy': float}.
        A ratio whose denominator is zero (no predicted positives, no
        actual positives, or no countable rows at all) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    actual_pos = lap['label'] == 1
    actual_neg = lap['label'] == 0
    predicted_pos = lap['prediction'] == 1
    predicted_neg = lap['prediction'] == 0

    # Summing a boolean mask counts its True entries, so the confusion
    # matrix is obtained without materialising four filtered frames.
    tp = float((actual_pos & predicted_pos).sum())
    tn = float((actual_neg & predicted_neg).sum())
    fp = float((actual_neg & predicted_pos).sum())
    fn = float((actual_pos & predicted_neg).sum())

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    return {
        'precision': _ratio(tp, tp + fp),
        'recall': _ratio(tp, tp + fn),
        'accuracy': _ratio(tp + tn, tp + tn + fp + fn),
    }

As before, prepare the training and testing datasets:
1. Load the feature matrix created in lab 10 into a Spark dataframe called 'fm'
2. Split into two dataframes - train (2011-2013) and test (only 2014)

In [3]:
# Read the feature matrix stored as ORC under /tmp/fm
fm = hc.read.format("orc").load("/tmp/fm")

# Split on the year column: everything up to and including 2013 is used for
# training, everything from 2014 onward for testing. Both halves are cached.
is_train_year = fm.year <= 2013
is_test_year = fm.year >= 2014
trainData = fm.filter(is_train_year).cache()
testData = fm.filter(is_test_year).cache()

# Row counts of the two splits, separated by a single space
print("%s %s" % (trainData.count(), testData.count()))

426306 150155


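As in lab 12, build instances of StringIndexer() and OneHotEncoder() for the categorical variables, then combine the resulting columns with a VectorAssembler. Note that dayofweek is only indexed (not one-hot encoded), and resolved is indexed to produce the label column: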

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

# Pre-processing stages, grouped by kind.

# 1) String -> numeric index for every string-valued column.
#    "resolved" is indexed straight into the "label" column.
inx1 = StringIndexer(inputCol="category", outputCol="cat-inx")
inx2 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx3 = StringIndexer(inputCol="district", outputCol="dis-inx")
inx4 = StringIndexer(inputCol="neighborhood", outputCol="ngh-inx")
inx5 = StringIndexer(inputCol="resolved", outputCol="label")

# 2) One-hot encoding of the indexed category, district and neighborhood.
#    NOTE(review): there is no encoder for "dow-inx"; it is fed to the
#    assembler below as a plain numeric index.
ohe1 = OneHotEncoder(inputCol="cat-inx", outputCol="cat-ohe")
ohe3 = OneHotEncoder(inputCol="dis-inx", outputCol="dis-ohe")
ohe4 = OneHotEncoder(inputCol="ngh-inx", outputCol="ngh-ohe")

# 3) Assemble everything into a single "features" vector. The order of this
#    list determines the order of the entries in the feature vector.
feature_columns = [
    "month", "hour", "prcp", "tmin", "tmax",
    "cat-ohe", "dow-inx", "dis-ohe", "ngh-ohe",
]
vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

Now build the complete feature generation and cross validation pipeline:
* Build a CrossValidator instance using LogisticRegression and a parameter grid. In your parameter grid, test the values [0.1, 0.5, 1.0, 5.0] for the regularization parameter (regParam), and the values [0.0, 0.5, 1.0] for elasticNetParam.
* Build a pipeline with all the stages plus the cross validator

In [5]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Estimator whose hyper-parameters are tuned by the cross validator
cvlr = LogisticRegression(maxIter=100)

# Candidate settings: 4 regParam values x 3 elasticNetParam values
grid_builder = ParamGridBuilder()
grid_builder.addGrid(cvlr.regParam, [0.1, 0.5, 1.0, 5.0])
grid_builder.addGrid(cvlr.elasticNetParam, [0.0, 0.5, 1.0])
grid = grid_builder.build()

# Candidates are scored by area under the ROC curve using 3-fold cross validation
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(
    estimator=cvlr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)

# Full pipeline: the pre-processing stages followed by the cross validator
pipeline_stages = [inx1, ohe1, inx2, inx3, ohe3, inx4, ohe4, inx5, vecAssembler, cv]
pipe = Pipeline(stages=pipeline_stages)

Run the end-to-end pipeline:
* Store the output of fit() in a variable called "model"
* Apply the fitted model to the test data
* Compute and print the resulting metrics.

In [6]:
# Fit every pipeline stage (including the cross validator) on the training data
model = pipe.fit(trainData)

# Score the test data; cached because it is evaluated twice below
results = model.transform(testData).cache()

# Area under ROC of the fitted pipeline on the test data
test_auc = evaluator.evaluate(results)
print("best AUC-ROC = " + str(test_auc))

# Bring only the two columns needed for the confusion-matrix metrics to the driver
label_and_prediction = results.select('label', 'prediction').toPandas()
print(eval_metrics(label_and_prediction))

best AUC-ROC = 0.856759254037
{'recall': 0.6437825735887837, 'precision': 0.7542678993764782, 'accuracy': 0.794612234024841}


The "model" variable is of type PipelineModel and contains the fitted version of each stage in the pipeline. We can look at the final stage (our cross-validation model) and print the intercept and weights for the best model:

In [7]:
# The cross validator is the last stage of the pipeline, so index from the end
# rather than hard-coding position 9; this keeps working if pre-processing
# stages are added or removed.
bestModel = model.stages[-1].bestModel

# Intercept and weight vector of the best logistic regression model
print(bestModel.intercept)
print(bestModel.weights)

-0.458344636827
[-0.0102934663622,0.00186486807845,-0.000277850255898,9.44566319521e-06,3.79132490894e-05,-1.09280456456,0.94265170687,-0.138998073021,0.201484753953,-0.807486732899,1.42975298722,1.61689763216,-0.786750849599,-0.540394240524,-1.01792172534,1.13860515254,-0.505028230997,-0.662144957732,0.470656863239,1.23475401051,0.858532787717,1.54271132569,-0.0157658732218,1.57691393002,0.16008341433,1.1327439085,-0.927905915448,0.802523880701,1.69137349719,0.533984801236,0.754606678819,1.29404062587,-0.4861696308,-0.43524394352,1.25216721225,0.0652692747964,-0.153615230777,0.806075117836,-0.639106538216,-0.0855644889181,0.337007291552,0.724027149259,0.299620135111,0.00412397645252,0.0266695848157,0.0739579912159,-0.0996848824653,0.0356472363874,-0.195907402578,-0.0204346154383,0.402258491104,-0.0707766567344,-0.0364279346575,0.184874215397,0.042568107117,0.0650824648588,0.15908098627,-0.142024641044,0.00686666282156,-0.033788558764,0.160548419892,-0.0665190575756,-0.0055014329379,-0