### Final Project
MScA 31013 Big Data Platforms
# Machine Learning Models for Binary Classification: Predicting Tip {Y/N}

In [1]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,IndexToString,VectorAssembler,OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.ml.tuning as tune
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
import pandas as pd
import numpy as np

sc = spark.sparkContext
spark = SparkSession.builder.appName('BDP-GroupProject').getOrCreate()

spark.conf.set("spark.sql.debug.maxToStringFields", 50)

*NOTE: received error on tree classifiers for including too many categorical features, dropping community pickup/dropoff features*

In [2]:
def read_data(path):
    table = spark.read \
    .option("quote", "\"")  \
    .option("escape", "\"") \
    .option("ignoreLeadingWhiteSpace",True) \
    .option("multiline", True)\
    .csv(path,inferSchema=True, header=True )
    return table

# read in modeling dataset
df0 = spark.read.parquet('gs://big-data-final/model-data/final-model-with-feature.parquet')

# select subset of features
var_list = [
 'label',
 'fare_add',
 'add_charge_pct',
 'trip_seconds',
 'trip_miles',
 'trip_start_year',
#'trip_start_month',
 'winter',
 'spring',
 'summer',
 'autumn',
 'trip_start_dow',
 'ride_type',
 'shared_trip_authorized',
 'rain_snow',
 'community_eventCnt',
#'pickup_community_name',
#'dropoff_community_name',
 'outside_chicago_ride',
 'covid_deaths_sma7'
]
model_df = df0.select(var_list)

                                                                                

# Data Engineering Cont.

## Balance Data by Undersampling Rides with No Tip 

Only 28% of the rides include a tip. Create a balanced dataset by splitting the data by tip/no tip and then undersampling the observations with no tip.

`tip` counts
```
modData.groupby('tip').count().show()
+---+---------+
|tip|    count|
+---+---------+
|  1|129935333|
|  0|332379528|
+---+---------+
```

In [3]:
#-----
# count grouping by tip Y/N
#-----
#model_df.groupby('label').count().show()

# number of rides with tip Y
tip1 = 129935333
# number of rides with tip N
tip0 = 332379528

# split model_df by tip label
major_df = model_df.filter(col("label") == 0)
minor_df = model_df.filter(col("label") == 1)
# ratio of majority count to minority count
ratio = tip0 / tip1
# undersample observations with no tip (label 0)
sampled_majority_df = major_df.sample(withReplacement=False, 
                                      fraction=1/ratio, 
                                      seed=7)
# union undersampled majority and minority
undersampled_df = sampled_majority_df.unionAll(minor_df)

## Feature Generation

In [4]:
#-----
# Create StringIndexer to OneHotEncoder
#-----
# dow
dow_indexer = StringIndexer(inputCol='trip_start_dow', 
                            outputCol='dow_idx')
dow_encoder = OneHotEncoder(inputCol='dow_idx',
                            outputCol='dow_vec')
# month
#month_indexer = StringIndexer(inputCol='trip_start_month', 
#                              outputCol='month_idx')
#month_encoder = OneHotEncoder(inputCol='month_idx',
#                              outputCol='month_vec')
# pickup community area
#pickup_indexer = StringIndexer(inputCol='pickup_community_name', 
#                               outputCol='pickup_idx')
#pickup_encoder = OneHotEncoder(inputCol='pickup_idx', 
#                               outputCol='pickup_vec')
# dropoff community area
#dropoff_indexer = StringIndexer(inputCol='dropoff_community_name', 
#                                outputCol='dropoff_idx')
#dropoff_encoder = OneHotEncoder(inputCol='dropoff_idx', 
#                                outputCol='dropoff_vec')

#-----
# Make a VectorAssembler
#-----
vectorAssembler = VectorAssembler(inputCols=['fare_add',
                                             'add_charge_pct',
                                             'trip_seconds',
                                             'trip_miles',
                                             'ride_type',
                                             'shared_trip_authorized',
                                             'rain_snow',
                                             'community_eventCnt',
                                             'outside_chicago_ride',
                                             'covid_deaths_sma7',
                                             'trip_start_year',
                                             'winter',
                                             'spring',
                                             'summer',
                                             'dow_vec'#,'month_vec',
                                             #'pickup_idx','pickup_vec',
                                             #'dropoff_idx','dropoff_vec'
                                            ],
                                  outputCol='features')

#-----
# Standardize features
#-----
scaler = StandardScaler(inputCol="features",   
                        outputCol="scaledFeatures")

#-----
# Make the pipeline
#-----
transit_pipe = Pipeline(stages=[dow_indexer,dow_encoder,
                                #month_indexer,month_encoder,
                                #pickup_indexer,pickup_encoder,
                                #dropoff_indexer,dropoff_encoder,
                                vectorAssembler,
                                scaler])

#-----
# Fit and transform the training data
#-----
piped_df = transit_pipe.fit(undersampled_df).transform(undersampled_df)

                                                                                

**Day of Week**

`trip_start_dow`
- 1 = Sunday
- 7 = Saturday

`dow_idx` is reindexed by frequency

In [23]:
piped_df.select('trip_start_dow','dow_idx','dow_vec').distinct().show()

                                                                                

+--------------+-------+-------------+
|trip_start_dow|dow_idx|      dow_vec|
+--------------+-------+-------------+
|             7|    0.0|(6,[0],[1.0])|
|             4|    3.0|(6,[3],[1.0])|
|             2|    6.0|    (6,[],[])|
|             6|    1.0|(6,[1],[1.0])|
|             5|    2.0|(6,[2],[1.0])|
|             1|    4.0|(6,[4],[1.0])|
|             3|    5.0|(6,[5],[1.0])|
+--------------+-------+-------------+



## Split data into training & test sets

In [5]:
# Split the data into training and test sets.
training, test = piped_df.randomSplit([0.7, 0.3],0.0)

# Logistic Regression

Train Model

In [None]:
# Initiate logistic regression object
lr = LogisticRegression(featuresCol='features', labelCol='label', 
                        maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

22/11/27 02:06:40 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/11/27 02:06:40 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

Coefficients: 
1 X 20 CSRMatrix

Intercept: [-0.3516756693617446]


Performance on training data

In [33]:
#-----
# predictions on training data
#-----
lrTrainPred = lrModel.transform(training)

#-----
# evaluate performance on training data
#-----
lrTrainEval = BinaryClassificationEvaluator()
lrTrainEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

lrTrainAUC = lrTrainEval.evaluate(lrTrainPred, {lrTrainEval.metricName: "areaUnderROC"})
lrTrain_accuracy = lrTrainEval2.evaluate(lrTrainPred, {lrTrainEval2.metricName: "accuracy"})
lrTrain_f1 = lrTrainEval2.evaluate(lrTrainPred, {lrTrainEval2.metricName: "f1"})
lrTrain_weightedPrecision = lrTrainEval2.evaluate(lrTrainPred, {lrTrainEval2.metricName: "weightedPrecision"})
lrTrain_weightedRecall = lrTrainEval2.evaluate(lrTrainPred, {lrTrainEval2.metricName: "weightedRecall"})

print("Train Area Under ROC: ", lrTrainAUC)
print("accuracy: ",lrTrain_accuracy)
print("f1: ",lrTrain_f1)
print("weighted precision: ",lrTrain_weightedPrecision)
print("weighted recall: ",lrTrain_weightedRecall)



Train Area Under ROC:  0.5
accuracy:  0.5870238653080143
f1:  0.4342682249132798
weighted precision:  0.34459701844116175
weighted recall:  0.5870238653080143


                                                                                

Performance on test data

In [None]:
#-----
# predictions on test data
#-----
lrTestPred = lrModel.transform(test)

#-----
# evaluate performance on test data
#-----
lrTestEval = BinaryClassificationEvaluator()
lrTestEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

lrTestAUC = lrTestEval.evaluate(lrTestPred, {lrTestEval.metricName: "areaUnderROC"})
lrTest_accuracy = lrTestEval2.evaluate(lrTestPred, {lrTestEval2.metricName: "accuracy"})
lrTest_f1 = lrTestEval2.evaluate(lrTestPred, {lrTestEval2.metricName: "f1"})
lrTest_weightedPrecision = lrTestEval2.evaluate(lrTestPred, {lrTestEval2.metricName: "weightedPrecision"})
lrTest_weightedRecall = lrTestEval2.evaluate(lrTestPred, {lrTestEval2.metricName: "weightedRecall"})

print("Test Area Under ROC: ", lrTestAUC)
print("accuracy: ",lrTest_accuracy)
print("f1: ",lrTest_f1)
print("weighted precision: ",lrTest_weightedPrecision)
print("weighted recall: ",lrTest_weightedRecall)



Test Area Under ROC:  0.5
accuracy:  0.5870924846657423
f1:  0.4343509768727654
weighted precision:  0.3446775855509948
weighted recall:  0.5870924846657423


                                                                                

# Random Forest Classifier

Train model

In [6]:
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rfModel = rf.fit(training)

                                                                                

Performance on training data

In [None]:
#-----
# predictions on training data
#-----
rfTrainPred = rfModel.transform(training)

#-----
# evaluate performance on training data
#-----
rfTrainEval = BinaryClassificationEvaluator()
rfTrainEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

rfTrainAUC = rfTrainEval.evaluate(rfTrainPred, {rfTrainEval.metricName: "areaUnderROC"})
rfTrain_accuracy = rfTrainEval2.evaluate(rfTrainPred, {rfTrainEval2.metricName: "accuracy"})
rfTrain_f1 = rfTrainEval2.evaluate(rfTrainPred, {rfTrainEval2.metricName: "f1"})
rfTrain_weightedPrecision = rfTrainEval2.evaluate(rfTrainPred, {rfTrainEval2.metricName: "weightedPrecision"})
rfTrain_weightedRecall = rfTrainEval2.evaluate(rfTrainPred, {rfTrainEval2.metricName: "weightedRecall"})

print("Train Area Under ROC: ", rfTrainAUC)
print("accuracy: ",rfTrain_accuracy)
print("f1: ",rfTrain_f1)
print("weighted precision: ",rfTrain_weightedPrecision)
print("weighted recall: ",rfTrain_weightedRecall)



Train Area Under ROC:  0.6506877522801002
accuracy:  0.6435837287978274
f1:  0.5977381767908745
weighted precision:  0.6523021158942635
weighted recall:  0.6435837287978274


                                                                                

Performance on test data

In [30]:
#-----
# predictions on test data
#-----
rfTestPred = rfModel.transform(test)

#-----
# evaluate performance on test data
#-----
rfTestEval = BinaryClassificationEvaluator()
rfTestEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

rfTestAUC = rfTestEval.evaluate(rfTestPred, {rfTestEval.metricName: "areaUnderROC"})
rfTest_accuracy = rfTestEval2.evaluate(rfTestPred, {rfTestEval2.metricName: "accuracy"})
rfTest_f1 = rfTestEval2.evaluate(rfTestPred, {rfTestEval2.metricName: "f1"})
rfTest_weightedPrecision = rfTestEval2.evaluate(rfTestPred, {rfTestEval2.metricName: "weightedPrecision"})
rfTest_weightedRecall = rfTestEval2.evaluate(rfTestPred, {rfTestEval2.metricName: "weightedRecall"})

print("Test Area Under ROC: ", rfTestAUC)
print("accuracy: ",rfTest_accuracy)
print("f1: ",rfTest_f1)
print("weighted precision: ",rfTest_weightedPrecision)
print("weighted recall: ",rfTest_weightedRecall)



Test Area Under ROC:  0.6508419364707961
accuracy:  0.6436822962576967
f1:  0.5978301480372933
weighted precision:  0.6524174129982206
weighted recall:  0.6436822962576967


                                                                                

Feature importance

In [17]:
rfModel.featureImportances

SparseVector(20, {0: 0.1613, 1: 0.101, 2: 0.0246, 3: 0.0657, 4: 0.4922, 5: 0.1034, 7: 0.0, 8: 0.0016, 9: 0.0269, 10: 0.0214, 11: 0.0005, 12: 0.0, 13: 0.0, 14: 0.0008, 16: 0.0002, 17: 0.0, 18: 0.0002, 19: 0.0})

# Gradient-Boosted Tree Classifier

Train model

In [None]:
gbt = GBTClassifier(featuresCol='features',labelCol='label',
                    maxIter=10)
gbtModel = gbt.fit(training)

                                                                                

Performance on training data

In [None]:
#-----
# predictions on training data
#-----
gbtTrainPred = gbtModel.transform(training)

#-----
# evaluate performance on training data
#-----
gbtTrainEval = BinaryClassificationEvaluator()
gbtTrainEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

gbtTrainAUC = gbtTrainEval.evaluate(gbtTrainPred, {gbtTrainEval.metricName: "areaUnderROC"})
gbtTrain_accuracy = gbtTrainEval2.evaluate(gbtTrainPred, {gbtTrainEval2.metricName: "accuracy"})
gbtTrain_f1 = gbtTrainEval2.evaluate(gbtTrainPred, {gbtTrainEval2.metricName: "f1"})
gbtTrain_weightedPrecision = gbtTrainEval2.evaluate(gbtTrainPred, {gbtTrainEval2.metricName: "weightedPrecision"})
gbtTrain_weightedRecall = gbtTrainEval2.evaluate(gbtTrainPred, {gbtTrainEval2.metricName: "weightedRecall"})

print("Train Area Under ROC: ", gbtTrainAUC)
print("accuracy: ",gbtTrain_accuracy)
print("f1: ",gbtTrain_f1)
print("weighted precision: ",gbtTrain_weightedPrecision)
print("weighted recall: ",gbtTrain_weightedRecall)



Train Area Under ROC:  0.6670634897133956
accuracy:  0.6529881826605598
f1:  0.6217288998968149
weighted precision:  0.6542623111260735
weighted recall:  0.6529881826605599


                                                                                

Performance on test data

In [None]:
#-----
# predictions on test data
#-----
gbtTestPred = gbtModel.transform(test)

#-----
# evaluate performance on test data
#-----
gbtTestEval = BinaryClassificationEvaluator()
gbtTestEval2 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

gbtTestAUC = gbtTestEval.evaluate(gbtTestPred, {gbtTestEval.metricName: "areaUnderROC"})
gbtTest_accuracy = gbtTestEval2.evaluate(gbtTestPred, {gbtTestEval2.metricName: "accuracy"})
gbtTest_f1 = gbtTestEval2.evaluate(gbtTestPred, {gbtTestEval2.metricName: "f1"})
gbtTest_weightedPrecision = gbtTestEval2.evaluate(gbtTestPred, {gbtTestEval2.metricName: "weightedPrecision"})
gbtTest_weightedRecall = gbtTestEval2.evaluate(gbtTestPred, {gbtTestEval2.metricName: "weightedRecall"})

print("Test Area Under ROC: ", gbtTestAUC)
print("accuracy: ",gbtTest_accuracy)
print("f1: ",gbtTest_f1)
print("weighted precision: ",gbtTest_weightedPrecision)
print("weighted recall: ",gbtTest_weightedRecall)



Test Area Under ROC:  0.6670146954786423
accuracy:  0.6530384023754677
f1:  0.6217642958072156
weighted precision:  0.6543046847789079
weighted recall:  0.6530384023754678


                                                                                

Feature importance

In [18]:
gbtModel.featureImportances

SparseVector(20, {0: 0.2241, 1: 0.1259, 2: 0.0628, 3: 0.0619, 4: 0.2155, 5: 0.0664, 7: 0.0065, 8: 0.0012, 9: 0.0574, 10: 0.0771, 11: 0.0545, 12: 0.0266, 13: 0.0133, 14: 0.0015, 16: 0.0023, 18: 0.0031})

# Save Trained Models

In [16]:
#https://www.sparkitecture.io/machine-learning/model-saving-and-loading
lrModel.save("/mnt/trainedmodels/lr")
rfModel.save("/mnt/trainedmodels/rf")
gbtModel.save("/mnt/trainedmodels/gbt")

                                                                                