In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import datetime
import warnings
# Put the parent directory first on the import path so that the local
# `utils` package imported below can be found.
sys.path.insert(0,'../')
# IPython autoreload in mode 2: modules are re-imported before each cell runs,
# so edits to the helper module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from utils import nyctaxi_helpers
# NOTE(review): this suppresses every warning for the rest of the session,
# including pandas/scikit-learn deprecation and convergence warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the 2017-06 green-taxi trip file; pandas infers the gzip compression
# from the ".gz" suffix.
raw_trips_path = '../datasets/nyctaxi/green_tripdata_2017-06.csv.gz'
dt = pd.read_csv(raw_trips_path)
# Quick peek at the first two raw records.
dt.head(n=2)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2017-06-01 00:33:45,2017-06-01 01:39:52,N,4,35,265,1,90.41,404.5,0.5,0.5,0.0,5.76,,0.3,413.51,2,1.0
1,2,2017-06-01 00:33:55,2017-06-01 23:36:23,N,1,263,75,1,0.38,4.0,0.5,0.5,1.59,0.0,,0.3,6.89,1,1.0


In [3]:
# Pass the raw trips through the project's preprocessing helper and keep the
# result under the same name `dt` that the following cells read.
# NOTE(review): because `dt` is rebound to its own processed form, this cell is
# not idempotent — re-run the load cell before re-running this one.
processed_trips = nyctaxi_helpers.preprocessing(dt)
dt = processed_trips
# Preview the first two processed records.
dt.head(n=2)

Unnamed: 0,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,d_hour_of_day,p_hour_of_day,pickup_dropoff_diff,day_of_week,tip_fair_class
0,N,4,35,265,1,90.41,404.5,0.5,0.5,5.76,0.3,413.51,2,1.0,1,0,8.285765,3,0.0
1,N,1,263,75,1,0.38,4.0,0.5,0.5,0.0,0.3,6.89,1,1.0,23,0,11.325969,3,1.0


In [4]:
# Build the modelling frame from the processed trips (the helper presumably
# encodes the categorical columns — confirm in utils/nyctaxi_helpers).
#
# Clean-up happens in two steps: +inf/-inf are first turned into NaN, then
# every NaN — both those former infinities and any values that were already
# missing — is replaced with 0.
# NOTE(review): the zero-fill also applies to the 'tip_fair_class' label
# column, so rows with a missing label end up in class 0.
#
# The calls are chained without inplace=True so the object returned by the
# helper is never mutated; re-running this cell therefore always starts from
# the same input.
X = (
    nyctaxi_helpers.convert_categorical(dt)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [5]:
import math
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Classifiers to compare. The decision tree gets a fixed seed so its result is
# repeatable, and class_weight="balanced" so classes are weighted inversely to
# their frequency.
dtmodel = DecisionTreeClassifier(class_weight="balanced", random_state=100)
gnbmodel = GaussianNB()

# Registry mapping a display name to its estimator; insertion order is the
# order in which the models are evaluated later.
models = {
    'Decision Tree': dtmodel,
    'Gaussian Naive Bayes': gnbmodel,
}

In [6]:
# Evaluate every classifier registered in `models` on a single hold-out split
# (67% train / 33% test). This is NOT k-fold cross-validation: each model is
# fitted once and scored once.
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Target is the tip class; every other column of X is used as a feature.
# NOTE(review): the processed data still carries 'total_amount', which
# presumably includes the tip. The near-perfect decision-tree score may
# therefore reflect label leakage — confirm how 'tip_fair_class' is derived.
y_cls = X['tip_fair_class']
XX_cls = X.drop('tip_fair_class',axis = 1)

# The split does not depend on the model, so it is computed once up front.
# With the fixed random_state every model sees exactly the partition it saw
# when the split was recomputed per iteration.
# Assumes run_classifier does not modify the frames it is given — TODO confirm.
x_train, x_test, y_train, y_test = train_test_split(
    XX_cls, y_cls, test_size=0.33, random_state=42
)

for model_name, model in models.items():
    print("Running the {} Classifier...".format(model_name))
    # Project helper; presumably fits `model` on the training data and returns
    # its predictions for x_test — verify against utils/nyctaxi_helpers.
    y_pred = nyctaxi_helpers.run_classifier(model, x_train, y_train, x_test)
    # average='macro' gives the unweighted mean over classes; `support` is
    # None in that mode and is unpacked only to match the return shape.
    (precision, recall, fscore, support) = precision_recall_fscore_support(y_test, y_pred, average='macro')
    print('Precision: {}\tRecall: {}\tfscore:{}'.format(precision, recall, fscore))


Running the Decision Tree Classifier...
Precision: 0.9753404828441747	Recall: 0.9788809302657342	fscore:0.9771029845571908
Running the Gaussian Naive Bayes Classifier...
Precision: 0.5681931746282897	Recall: 0.7660265862601382	fscore:0.4893861087949189
