# Lab 11: Predictive model with Logistic Regression

As always, we create a SparkContext/HiveContext.

In [1]:
# Set up Spark Context
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import *

# Executor memory is registered as a system property before the context is created.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

# Build the configuration in one chained expression; each set() call hands
# back the same SparkConf, so the result matches setting the keys one by one.
conf = (
    SparkConf()
    .set('spark.executor.instances', 15)
    .set('spark.sql.autoBroadcastJoinThreshold', 100*1024*1024)  # 100MB for broadcast join
)
sc = SparkContext(master='yarn-client', appName='Spark-lab11', conf=conf)

# The Hive-backed SQL context gives access to the tables of the "demo" database.
from pyspark.sql import HiveContext
hc = HiveContext(sc)
hc.sql("use demo")

DataFrame[result: string]

Let's load the feature matrix created in lab 10 into a Spark DataFrame called 'fm', using the DataFrameReader API:

In [2]:
# Read the ORC-formatted feature matrix stored under /tmp/fm.
fm = (
    hc.read
      .format("orc")
      .load("/tmp/fm")
)

# Preview the first five rows as a Pandas frame (last expression => cell output).
fm.limit(5).toPandas()


Unnamed: 0,year,month,day,prcp,tmin,tmax,hour,resolved,category,district,dayofweek,description,neighborhood
0,2011,1,1,135,67,94,9,1,OTHER OFFENSES,PARK,Saturday,FALSE PERSONATION,Haight Ashbury
1,2011,1,1,135,67,94,22,0,SUSPICIOUS OCC,TENDERLOIN,Saturday,SUSPICIOUS OCCURRENCE,Downtown/Civic Center
2,2011,1,1,135,67,94,19,1,DRUG/NARCOTIC,TENDERLOIN,Saturday,POSSESSION OF MARIJUANA,Downtown/Civic Center
3,2011,1,1,135,67,94,0,0,FRAUD,NORTHERN,Saturday,"CREDIT CARD, THEFT BY USE OF",Downtown/Civic Center
4,2011,1,1,135,67,94,0,0,OTHER OFFENSES,BAYVIEW,Saturday,FALSE PERSONATION TO RECEIVE MONEY OR PROPERTY,Excelsior


Split the dataset into a training and testing set as follows:
1. Use years 2011-2013 for training your model.
2. Use the year 2014 as your test set.

In [3]:
trainData = fm.filter(fm.year<=2013)
testData = fm.filter(fm.year>=2014)

print trainData.count(), testData.count()

426306 150155


Using Spark ML's pipeline API, create the components of an end-to-end pipeline as follows:
1. Use the StringIndexer() transformation to convert all string variables (category, dayofweek, district, neighborhood) into categorical variables
2. Similarly, convert the "resolved" variable to a categorical variable called "label". We need to do this since Spark-ML Logistic Regression requires a categorical variable as the target variable, whereas "resolved" is a numerical variable with values 0.0 and 1.0.
3. Use VectorAssembler to combine all the features of the model (month, hour, prcp, tmin, tmax, and the indexed categorical variables) into a single vector column called "features".

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# One StringIndexer per string-valued feature; each writes a "<abbr>-inx" column.
inx1 = StringIndexer(outputCol="cat-inx", inputCol="category")
inx2 = StringIndexer(outputCol="dow-inx", inputCol="dayofweek")
inx3 = StringIndexer(outputCol="dis-inx", inputCol="district")
inx4 = StringIndexer(outputCol="ngh-inx", inputCol="neighborhood")

# The target column is indexed too, producing the "label" column.
# NOTE(review): per the Spark docs StringIndexer numbers values by frequency,
# so label 1.0 equals resolved == 1 only if 0 is the more frequent class in
# the training data -- confirm.
inx5 = StringIndexer(outputCol="label", inputCol="resolved")

# Assemble the numeric columns followed by the indexed categorical columns
# into the single "features" vector.
vecAssembler = VectorAssembler(
    inputCols=["month", "hour", "prcp", "tmin", "tmax"]
              + ["cat-inx", "dow-inx", "dis-inx", "ngh-inx"],
    outputCol="features",
)

Create a Logistic Regression classifier with reasonable parameter settings such as maxIter=30 and regParam=0.01:

In [5]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator: at most 30 iterations, regularization 0.01.
lr = LogisticRegression(
    regParam=0.01,
    maxIter=30,
)

Create the spark-ML pipeline to combine all the processing steps. Then train the model using the training set:

In [6]:
# Chain the indexers, the feature assembler and the classifier, in that order.
pipeline_lr = Pipeline(
    stages=[
        inx1, inx2, inx3, inx4,  # categorical feature indexers
        inx5,                    # target -> "label"
        vecAssembler,            # -> "features"
        lr,                      # classifier
    ]
)

# Fit every stage on the training rows.
model_lr = pipeline_lr.fit(trainData)

Compute the predictions using the testData:

In [7]:
# Run the fitted pipeline over the test rows and mark the result for caching.
results = (
    model_lr
    .transform(testData)
    .cache()
)

Define the eval_metrics function below; it takes a Pandas data frame created from your results data frame and computes the precision, recall and accuracy of the model:

In [8]:
def eval_metrics(lap):
    """Compute precision, recall and accuracy for binary predictions.

    Parameters
    ----------
    lap : pandas.DataFrame
        Frame with a 'label' column (ground truth) and a 'prediction'
        column. A value equal to 1 counts as positive and a value equal
        to 0 as negative; rows holding any other value are ignored.

    Returns
    -------
    dict
        ``{'precision': float, 'recall': float, 'accuracy': float}``.
        A metric whose denominator is zero (no positive predictions, no
        positive labels, or an empty frame) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    label = lap['label']
    prediction = lap['prediction']

    # Confusion-matrix cells, counted from boolean masks.
    tp = float(((label == 1) & (prediction == 1)).sum())
    tn = float(((label == 0) & (prediction == 0)).sum())
    fp = float(((label == 0) & (prediction == 1)).sum())
    fn = float(((label == 1) & (prediction == 0)).sum())

    def _ratio(numerator, denominator):
        # Guard against 0/0, e.g. when the model never predicts the positive class.
        return numerator / denominator if denominator else 0.0

    return {
        'precision': _ratio(tp, tp + fp),
        'recall': _ratio(tp, tp + fn),
        'accuracy': _ratio(tp + tn, tp + tn + fp + fn),
    }

Use this function to compute the precision, recall and accuracy of the current model:

In [9]:
# Bring only the two columns the metrics need into a local Pandas frame.
lap = results.select("label", "prediction").toPandas()

# Parenthesised single-argument print works under both Python 2 and Python 3
# (the bare `print expr` statement is a SyntaxError on Python 3).
print(eval_metrics(lap))

{'recall': 0.15723408940761946, 'precision': 0.5328358208955224, 'accuracy': 0.6441277346741701}


With Logistic Regression, you can print the trained model's weights and intercept coefficients.

In [10]:
# The fitted classifier is the last stage of the fitted pipeline.
# Parenthesised single-argument prints behave the same under Python 2 and 3.
# NOTE(review): newer Spark releases expose the weight vector as
# `coefficients` -- confirm against the cluster's Spark version before upgrading.
print(model_lr.stages[-1].weights)
print(model_lr.stages[-1].intercept)

[-0.0229020936192,-0.00106710516373,-0.000454037952796,0.000157987875918,0.000136662126704,0.0507473781292,0.00523966744931,0.0450667138353,-0.0398805346853]
-0.427123334573


Note that the recall is relatively low. One possible cause for this might be that our categorical variables are represented as numerical values in our regression model. Create a different Spark-ML pipeline that uses OneHotEncoder to transform some of these categorical variables into dummy variables and re-run the logistic regression model. 

Did the results improve?

In [11]:
from pyspark.ml.feature import OneHotEncoder

# Index the string columns as before; category, district and neighborhood are
# additionally one-hot encoded, while day-of-week stays a plain index.
inx1 = StringIndexer(inputCol="category", outputCol="cat-inx")
ohe1 = OneHotEncoder(inputCol="cat-inx", outputCol="cat-ohe")
inx2 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx3 = StringIndexer(inputCol="district", outputCol="dis-inx")
ohe3 = OneHotEncoder(inputCol="dis-inx", outputCol="dis-ohe")
inx4 = StringIndexer(inputCol="neighborhood", outputCol="ngh-inx")
ohe4 = OneHotEncoder(inputCol="ngh-inx", outputCol="ngh-ohe")
inx5 = StringIndexer(inputCol="resolved", outputCol="label")

# The assembler consumes the one-hot columns in place of the raw indices.
# (No backslash continuation is needed inside the brackets.)
vecAssembler = VectorAssembler(inputCols=["month", "hour", "prcp", "tmin", "tmax",
                                          "cat-ohe", "dow-inx", "dis-ohe", "ngh-ohe"],
                               outputCol="features")

# NOTE(review): this classifier drops the regParam=0.01 used for the first
# model, so the metric change below reflects both the one-hot encoding and a
# different regularization setting -- confirm this is intended.
lr = LogisticRegression(maxIter=30)
lr.setFitIntercept(True)

# Each encoder must come after the indexer that produces its input column.
pipeline_lr = Pipeline(stages=[inx1, ohe1, inx2, inx3, ohe3, inx4, ohe4, inx5, vecAssembler, lr])
model_lr = pipeline_lr.fit(trainData)

# Score the test set and evaluate it the same way as the first model.
results = model_lr.transform(testData).cache()
lap = results.select("label", "prediction").toPandas()

# Parenthesised single-argument print works under both Python 2 and Python 3
# (the bare `print expr` statement is a SyntaxError on Python 3).
print(eval_metrics(lap))

{'recall': 0.6437825735887837, 'precision': 0.7542678993764782, 'accuracy': 0.794612234024841}
