# Classification of Tippers vs Non-Tippers with Tree Models

## Load necessary libraries

In [129]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

## Load data

In [90]:
# Load data
data = pd.read_csv("../../cleaned_data.csv", index_col = 0)

## Preparing the data for the model

In [91]:
data.dtypes

duration_seconds               float64
miles                          float64
fare                           float64
tip                            float64
tolls                          float64
extra_charges                  float64
trip_total                     float64
payment_type                    object
start_month                      int64
start_weekday                   object
speed_pickup_start             float64
speed_category_pickup_start     object
bus_count_pickup_start         float64
gps_pings_pickup_start         float64
speed_dropoff_end              float64
speed_category_dropoff_end      object
bus_count_dropoff_end          float64
gps_pings_dropoff_end          float64
region_pickup                   object
region_dropoff                  object
tip_flag                         int64
ride_type                       object
flag_overnight                   int64
total_no_tip                   float64
start_hour                       int64
flag_weekend             

In [92]:
# Separate categorical variables
cat_vars = ["payment_type", "start_weekday", "speed_category_pickup_start", "speed_category_dropoff_end", "region_pickup", 
           "region_dropoff", "tip_flag", "ride_type", "flag_overnight", "flag_weekend"]

In [93]:
# Convert into type categorial
for var in cat_vars:
    data[var] = data[var].astype('category')

In [94]:
# Possible predictors
feature_names = ["duration_seconds", "miles", "payment_type", "start_month", "start_weekday", 
                 "speed_pickup_start", "speed_dropoff_end", "ride_type", "flag_overnight", "total_no_tip", 
                 "start_hour", "flag_weekend"]
# Possible target
target_name = ["tip_flag"]

## Encode categorical features

Variables to encode:

* Payment Type
* Start Weekday
* Ride Type

### Encode Payment Type

In [95]:
# Encode categorical variables with LabelEncoder (they are not ordinal)
le = LabelEncoder()

In [96]:
le.fit(data["payment_type"])

LabelEncoder()

In [97]:
list(le.classes_)

['Card', 'Cash', 'Mobile', 'Other']

In [98]:
data["payment_type_encoded"] = le.transform(data["payment_type"])

In [99]:
data["payment_type_encoded"].unique()

array([1, 0, 2, 3])

### Encode Start Weekday

In [100]:
data.start_weekday

0             Friday
1          Wednesday
2             Sunday
3            Tuesday
4           Thursday
             ...    
5536739       Sunday
5536740    Wednesday
5536741     Thursday
5536742       Friday
5536743     Saturday
Name: start_weekday, Length: 5536622, dtype: category
Categories (7, object): [Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday]

In [101]:
weekday_map = {key:value for key, value in zip(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], range(1,9))}

In [102]:
weekday_map

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [103]:
data["start_weekday"]

0             Friday
1          Wednesday
2             Sunday
3            Tuesday
4           Thursday
             ...    
5536739       Sunday
5536740    Wednesday
5536741     Thursday
5536742       Friday
5536743     Saturday
Name: start_weekday, Length: 5536622, dtype: category
Categories (7, object): [Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday]

In [104]:
data["start_weekday_encoded"] = data["start_weekday"].map(weekday_map)

In [105]:
data["start_weekday_encoded"].head()

0    5
1    3
2    7
3    2
4    4
Name: start_weekday_encoded, dtype: category
Categories (7, int64): [5, 1, 6, 7, 4, 2, 3]

## Encode Ride Type

In [113]:
# Create map
ride_map = {"Taxi": 1, "Rideshare": 2}

In [114]:
data["ride_type_encoded"] = data["ride_type"].map(ride_map)

In [115]:
data["ride_type_encoded"].head()

0    1
1    1
2    1
3    1
4    1
Name: ride_type_encoded, dtype: category
Categories (2, int64): [2, 1]

## Select features and target

In [116]:
# Predictors
feature_names = ["duration_seconds", "miles", "payment_type_encoded", "start_month", "start_weekday_encoded", 
                 "speed_pickup_start", "speed_dropoff_end", "ride_type_encoded", "flag_overnight", "total_no_tip", 
                 "start_hour", "flag_weekend"]
# Predicted variable
target_name = ["tip_flag"]

## Split into Train and Test

In [117]:
# Separate into x and y
x = data[feature_names]
y = data[target_name]

In [118]:
# Split into train and test samples
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 232323, train_size = 0.7)

In [119]:
x_train.head()

Unnamed: 0,duration_seconds,miles,payment_type_encoded,start_month,start_weekday_encoded,speed_pickup_start,speed_dropoff_end,ride_type_encoded,flag_overnight,total_no_tip,start_hour,flag_weekend
5076741,413.0,1.0,2,1,6,21.408,21.326,2,0,7.55,23,1
5444706,972.0,5.3,2,8,2,22.021667,18.976667,2,0,12.55,21,0
4790627,417.0,1.8,2,8,7,25.62,25.62,2,0,7.55,0,1
955347,448.0,1.1,2,9,1,18.001667,19.788333,2,0,7.55,18,0
4634742,388.0,1.3,2,4,6,19.396667,0.0,2,0,5.05,3,1


## Training a Classification Tree with Cross Validation, on Train sample

In [120]:
# Create dictionary with parameters to search and its values (in this case, only max_depth of the tree)
parameters = {'max_depth':range(1,10)}
# Search over the specified values of the parameters, for an estimator (in this case, DecisionTreeClassifier)
# n_jobs indicates how many jobs to run in parallel
tip_tree = GridSearchCV(estimator = tree.DecisionTreeClassifier(), param_grid = parameters, n_jobs=4)

In [121]:
# Fit the model with cross validation to estimate best max_depth
tip_tree.fit(X = x_train, y = y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=4, param_grid={'max_depth': range(1, 10)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scori

In [124]:
tree_model = tip_tree.best_estimator_

In [125]:
print (tip_tree.best_score_, tip_tree.best_params_) 

0.8374305113871663 {'max_depth': 6}


## Results  
https://www.datacamp.com/community/tutorials/decision-tree-classification-python

### Accuracy in Train

In [127]:
# Accuracy in train set
tip_tree.score(x_train, y_train)

0.8374488309657643

### Accuracy in Test

In [135]:
# Accuracy in test set
tip_tree.score(x_test, y_test)

0.8371877684774174

### Confusion Matrix

In [136]:
y_hat_test = tree_model.predict(x_test)

In [137]:
print(metrics.confusion_matrix(y_hat_test, y_test))

[[1286891  263780]
 [   6649  103667]]


### Classification Report

In [134]:
print(metrics.classification_report(y_hat_test, y_test))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90   1550671
           1       0.28      0.94      0.43    110316

    accuracy                           0.84   1660987
   macro avg       0.64      0.88      0.67   1660987
weighted avg       0.95      0.84      0.87   1660987



In [145]:
# Precision: True positive / True positive + False positive (people that really tipped out of all the ones we said would tip)
round(metrics.precision_score(y_hat_test, y_test), 2)
# Precision is very bad

0.28

In [146]:
# Recall or Sensitivity: True positive / True Positive + False negative (identify people that tip out of all that actually tipped)
round(metrics.recall_score(y_hat_test, y_test), 2)
# Recall is very good

0.94

In [147]:
# F1 score: harmonic mean between precision and recall
round(metrics.f1_score(y_hat_test, y_test), 2)

0.43