### Final Project
MScA 31013 Big Data Platforms
# Machine Learning Models: Predicting Tip Percent

***Tip percent refers to the percent of ride cost (fare + additional charges) that a rider tips***

In [12]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,IndexToString,VectorAssembler,OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.ml.tuning as tune
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
import pandas as pd
import numpy as np

# Create (or reuse) the Spark session first, then derive the SparkContext from it.
# The original order read `spark` before assigning it, which only works when the
# runtime has already injected a `spark` variable into the kernel.
spark = SparkSession.builder.appName('BDP-GroupProject').getOrCreate()
sc = spark.sparkContext

# Raise the field limit used when Spark renders plans/schemas as strings
spark.conf.set("spark.sql.debug.maxToStringFields", 50)

In [13]:
def read_data(path):
    """Read the CSV at `path` into a Spark DataFrame.

    The first row is treated as the header and column types are inferred.
    A double quote is used both as the quote and as the escape character,
    leading whitespace is ignored, and quoted fields may span multiple lines.

    Relies on the notebook-level `spark` session.
    """
    csv_options = {
        "quote": "\"",
        "escape": "\"",
        "ignoreLeadingWhiteSpace": True,
        "multiline": True,
    }
    reader = spark.read
    for option_name, option_value in csv_options.items():
        reader = reader.option(option_name, option_value)
    return reader.csv(path, inferSchema=True, header=True)

# Load the modeling dataset (parquet) from GCS
df0 = spark.read.parquet('gs://big-data-final/model-data/final-model-with-feature.parquet')

# Columns kept for modeling, in the order they are selected.
# Currently disabled: 'trip_start_month' (would sit after 'trip_start_year'),
# 'pickup_community_name' and 'dropoff_community_name'
# (would sit after 'community_eventCnt').
var_list = [
    'tip_pct',
    'fare_add', 'add_charge_pct',
    'trip_seconds', 'trip_miles',
    'trip_start_year',
    'winter', 'spring', 'summer',
    'trip_start_dow',
    'ride_type', 'shared_trip_authorized',
    'rain_snow',
    'community_eventCnt',
    'outside_chicago_ride',
    'covid_deaths_sma7',
]
model_df = df0.select(*var_list)

# Data Engineering Cont.

## Feature Generation

In [14]:
#-----
# Create StringIndexer to OneHotEncoder
#-----
# dow
dow_indexer = StringIndexer(inputCol='trip_start_dow', 
                            outputCol='dow_idx')
dow_encoder = OneHotEncoder(inputCol='dow_idx',
                            outputCol='dow_vec')
# month
#month_indexer = StringIndexer(inputCol='trip_start_month', 
#                              outputCol='month_idx')
#month_encoder = OneHotEncoder(inputCol='month_idx',
#                              outputCol='month_vec')
# pickup community area
#pickup_indexer = StringIndexer(inputCol='pickup_community_name', 
#                               outputCol='pickup_idx')
#pickup_encoder = OneHotEncoder(inputCol='pickup_idx', 
#                               outputCol='pickup_vec')
# dropoff community area
#dropoff_indexer = StringIndexer(inputCol='dropoff_community_name', 
#                                outputCol='dropoff_idx')
#dropoff_encoder = OneHotEncoder(inputCol='dropoff_idx', 
#                                outputCol='dropoff_vec')

#-----
# Assemble the model inputs into a single `features` vector
#-----
# Order matters: positions in `features` follow this list, with `dow_vec`
# expanding into its one-hot slots at the end.
# Currently disabled inputs: 'month_vec', 'pickup_idx', 'pickup_vec',
# 'dropoff_idx', 'dropoff_vec'.
assembler_inputs = [
    'fare_add', 'add_charge_pct',
    'trip_seconds', 'trip_miles',
    'ride_type', 'shared_trip_authorized',
    'rain_snow', 'community_eventCnt',
    'outside_chicago_ride', 'covid_deaths_sma7',
    'trip_start_year',
    'winter', 'spring', 'summer',
    'dow_vec',
]
vectorAssembler = VectorAssembler(inputCols=assembler_inputs, outputCol='features')

#-----
# Standardize the assembled vector into `scaledFeatures`
#-----
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")

#-----
# Pipeline: index + one-hot encode day of week, assemble, then scale
# (month / pickup / dropoff indexer+encoder stages are currently disabled)
#-----
pipeline_stages = [dow_indexer, dow_encoder, vectorAssembler, scaler]
transit_pipe = Pipeline(stages=pipeline_stages)

#-----
# Fit the pipeline on the full modeling frame and transform that same frame
#-----
# NOTE(review): the fit uses all of `model_df`, i.e. it happens before the
# train/test split, so the indexer ordering and scaler statistics are computed
# from rows that later land in the test set.
transit_pipe_model = transit_pipe.fit(model_df)
piped_df = transit_pipe_model.transform(model_df)

                                                                                

**Day of Week**

`trip_start_dow`
- 1 = Sunday
- 7 = Saturday

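`dow_idx` is assigned by `StringIndexer` in descending order of frequency (0 = most frequent day). The last index (6) has no slot of its own in `dow_vec`; it is encoded as the all-zeros vector.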

In [15]:
# One row per distinct weekday: raw value, assigned index, and one-hot vector
dow_lookup = piped_df.select('trip_start_dow', 'dow_idx', 'dow_vec').distinct()
dow_lookup.show()



+--------------+-------+-------------+
|trip_start_dow|dow_idx|      dow_vec|
+--------------+-------+-------------+
|             7|    0.0|(6,[0],[1.0])|
|             2|    6.0|    (6,[],[])|
|             6|    1.0|(6,[1],[1.0])|
|             5|    2.0|(6,[2],[1.0])|
|             3|    5.0|(6,[5],[1.0])|
|             1|    3.0|(6,[3],[1.0])|
|             4|    4.0|(6,[4],[1.0])|
+--------------+-------+-------------+



                                                                                

## Split data into training & test sets

In [16]:
# 70/30 random split of the transformed frame into training and test sets,
# with a fixed seed so the same split is requested on every run.
split_weights = [0.7, 0.3]
training, test = piped_df.randomSplit(split_weights, seed=0)

# Random Forest Regressor

In [17]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

Train model

In [None]:
# Fit a random forest regressor on the unscaled `features` vector with
# `tip_pct` as the label. A fixed seed makes the forest's random sampling
# repeatable across kernel restarts; every other hyperparameter is left at
# the library default.
rf = RandomForestRegressor(labelCol='tip_pct', featuresCol="features", seed=0)
rfm = rf.fit(training)

                                                                                

Performance on training data

In [None]:
#-----
# Score the training set with the fitted forest
#-----
trainPred = rfm.transform(training)

#-----
# RMSE and R2 of `prediction` against `tip_pct` on the training set
#-----
shared_eval_args = {"labelCol": "tip_pct", "predictionCol": "prediction"}
evaluator = RegressionEvaluator(metricName="rmse", **shared_eval_args)
evaluator_r2 = RegressionEvaluator(metricName="r2", **shared_eval_args)

rmseTrain = evaluator.evaluate(trainPred)
r2Train = evaluator_r2.evaluate(trainPred)

print("training rmse: ", rmseTrain)
print("training R2: ", r2Train)



training rmse:  0.7728764866929401
training R2:  0.003499787754531636


                                                                                

Performance on test data

In [None]:
#-----
# Score the held-out test set with the fitted forest
#-----
testPred = rfm.transform(test)

#-----
# RMSE and R2 of `prediction` against `tip_pct` on the test set
#-----
evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="tip_pct",
                                metricName="rmse")
evaluator_r2 = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="tip_pct",
                                   metricName="r2")

# Evaluate in the same order as before: RMSE first, then R2
rmseTest, r2Test = [metric.evaluate(testPred) for metric in (evaluator, evaluator_r2)]

print("test rmse: ", rmseTest)
print("test R2: ", r2Test)



test rmse:  0.8728565556790877
test R2:  0.002690292800045402


                                                                                

Feature Importance

In [20]:
# Importances are indexed by position in the `features` vector, i.e. the order
# of the VectorAssembler inputCols: the scalar columns first, followed by the
# one-hot slots of `dow_vec`.
rfm.featureImportances

SparseVector(20, {0: 0.1451, 1: 0.0932, 2: 0.0613, 3: 0.1574, 4: 0.3511, 5: 0.0623, 7: 0.0, 8: 0.0048, 9: 0.0158, 10: 0.0275, 11: 0.003, 12: 0.0045, 13: 0.0084, 14: 0.0012, 15: 0.0, 16: 0.0011, 17: 0.0008, 18: 0.0002, 19: 0.0624})

# Save Trained Models

In [None]:
# Persist the fitted forest. A plain `save()` fails when the target path already
# exists, which breaks re-running the notebook; going through the writer with
# `overwrite()` replaces any previously saved copy instead.
# Reference: https://www.sparkitecture.io/machine-learning/model-saving-and-loading
rfm.write().overwrite().save("/mnt/trainedmodels/rfm")

                                                                                