# MSiA 420 - Predictive Analytics II - Final Project
## Group 6: Alejandra Lelo de Larrea Ibarra, Kiran Jyothi Sheena, Lixuan (Ellen) Chen, Wencheng Zhang

# Random Forest


In [1]:
# Libraries
import numpy as np
import pandas as pd
import zipfile

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns

import statsmodels.api as sm
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix 

## Read data

In [2]:
## Read the CSV directly from the zip archive without extracting it to disk.
# Both the archive and the member handle are opened as context managers so
# that they are closed as soon as the frame has been loaded.
with zipfile.ZipFile('../02_Data/hotel_bookings_ohe.csv.zip', 'r') as archive:
    # Only the first member is read — presumably the archive holds a single CSV.
    files = archive.namelist()
    with archive.open(files[0]) as csvfile:
        df = pd.read_csv(csvfile)

df.head()

Unnamed: 0,arrival_date_year,adults,children,babies,previous_cancellations,booking_changes,total_of_special_requests,log_lead_time,total_nights,previous_bookings,...,market_segment_Online TA,market_segment_Undefined,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,required_car_parking_0,required_car_parking_1,is_canceled
0,2015,2,0,0,0,3,0,5.83773,0,0,...,0,0,1,0,0,0,0,1,0,0
1,2015,2,0,0,0,4,0,6.603944,0,0,...,0,0,1,0,0,0,0,1,0,0
2,2015,1,0,0,0,0,0,2.079442,1,0,...,0,0,1,0,0,0,0,1,0,0
3,2015,1,0,0,0,0,0,2.639057,1,0,...,0,0,1,0,0,0,0,1,0,0
4,2015,2,0,0,0,0,1,2.70805,2,0,...,1,0,1,0,0,0,0,1,0,0


In [3]:
# Shape of the loaded frame as (rows, columns); the column count includes the
# `is_canceled` response, which is only separated out in the train/test split.
df.shape

(119388, 66)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,arrival_date_year,adults,children,babies,previous_cancellations,booking_changes,total_of_special_requests,log_lead_time,total_nights,previous_bookings,...,market_segment_Online TA,market_segment_Undefined,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,required_car_parking_0,required_car_parking_1,is_canceled
count,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,...,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0,119388.0
mean,2016.156548,1.856401,0.103888,0.007949,0.087119,0.221103,0.571372,3.840079,3.427865,0.224202,...,0.473054,1.7e-05,0.773177,0.006684,0.121143,0.089205,0.009792,0.937883,0.062117,0.370414
std,0.707478,0.579266,0.398558,0.097437,0.844343,0.652287,0.792802,1.609797,2.55738,1.827964,...,0.499275,0.004093,0.418779,0.081483,0.326295,0.285041,0.098467,0.241369,0.241369,0.482918
min,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2016.0,2.0,0.0,0.0,0.0,0.0,0.0,2.944439,2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2016.0,2.0,0.0,0.0,0.0,0.0,0.0,4.248495,3.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2017.0,2.0,0.0,0.0,0.0,0.0,1.0,5.081404,4.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,2017.0,55.0,10.0,10.0,26.0,21.0,5.0,6.603944,69.0,78.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Feature names (the list also contains the response column `is_canceled`)
df.columns

Index(['arrival_date_year', 'adults', 'children', 'babies',
       'previous_cancellations', 'booking_changes',
       'total_of_special_requests', 'log_lead_time', 'total_nights',
       'previous_bookings', 'log_days_in_waiting_list', 'log_adr',
       'arrival_month_1', 'arrival_month_2', 'arrival_month_3',
       'arrival_month_4', 'arrival_month_5', 'arrival_month_6',
       'arrival_month_7', 'arrival_month_8', 'arrival_month_9',
       'arrival_month_10', 'arrival_month_11', 'arrival_month_12',
       'booked_by_agent_no', 'booked_by_agent_yes', 'booked_by_company_0',
       'booked_by_company_1', 'continent_Africa', 'continent_Americas',
       'continent_Antarctica', 'continent_Asia', 'continent_Europe',
       'continent_Oceania', 'continent_unknown', 'customer_type_Contract',
       'customer_type_Group', 'customer_type_Transient',
       'customer_type_Transient-Party', 'deposit_type_No Deposit',
       'deposit_type_Non Refund', 'deposit_type_Refundable',
       'domestic_

## Splitting training and test

In [6]:
# Row positions of the training observations, read from a separate index file
train_ind = pd.read_csv("../02_Data/train_index.csv")

# Training partition: pick the listed rows and renumber them from 0
train = df.iloc[train_ind.Train_Index, :].reset_index(drop=True)
ytrain = train["is_canceled"]
xtrain = train.drop(columns="is_canceled")

# Test partition: every row not listed in the training index
# (the original index labels are kept, i.e. no reset here)
test_rows = df.index.difference(train_ind.Train_Index)
test = df.iloc[test_rows, :]
ytest = test["is_canceled"]
xtest = test.drop(columns="is_canceled")

In [7]:
# Share (in percent) of each response class within the train and test
# partitions, printed one partition after the other.
for heading, subset in (("Train Class Response:", train),
                        ("Test Class Response:", test)):
    print(heading)
    print(subset.is_canceled.value_counts() / subset.shape[0] * 100)

Train Class Response:
0    62.941441
1    37.058559
Name: is_canceled, dtype: float64
Test Class Response:
0    63.027181
1    36.972819
Name: is_canceled, dtype: float64


## Fit Random Forest with sklearn

In [None]:
# Base random forest; the three hyperparameters below are tuned by grid search
rf = RandomForestClassifier(
    criterion='gini',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=190322,
)

# Candidate values for each tuned hyperparameter
n_estimators = [50, 100, 150, 200, 250, 500, 1000]   # number of trees
min_samples_leaf = list(range(2, 6))                 # node size: 2..5
max_features = list(range(1, 6))                     # features tried per split: 1..5

# Search grid keyed by the estimator's parameter names
hyperparms = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
}

# Exhaustive 10-fold cross-validated search over the grid.
# `fit` returns the search object itself, so `best_mod` and `cv_hyper`
# refer to the same fitted GridSearchCV instance.
cv_hyper = GridSearchCV(estimator=rf, param_grid=hyperparms, cv=10, verbose=0)
best_mod = cv_hyper.fit(xtrain, ytrain)
                                                                                                                                                                                                                             



In [None]:
# Report the tuned hyperparameters of the best estimator found by the search
best_params = best_mod.best_estimator_.get_params()
print("Best number of trees (n_estimators):", best_params['n_estimators'])
print("Best node size (min_samples_leaf):", best_params['min_samples_leaf'])
print("Best number of features (max_features):", best_params['max_features'])

## Train best model

In [None]:
# Set CV
# Fix: the KFold keyword is `n_splits`; the previous `nsplits` raised a TypeError.
# NOTE(review): `kfold` is not used by any cell visible here — confirm whether
# it is still needed further down.
kfold = KFold(n_splits = 10, random_state = 190322, shuffle = True)

# Hyperparameters selected by the grid search (looked up once)
tuned_params = best_mod.best_estimator_.get_params()

# Create random forest object with the best hyperparams; all other settings
# match the base forest used during the search.
rf_model = RandomForestClassifier(
    n_estimators = tuned_params['n_estimators'],
    min_samples_leaf = tuned_params['min_samples_leaf'],
    max_features = tuned_params['max_features'],
    criterion='gini',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=190322)

# Estimate model on the full training partition
rf_model.fit(xtrain, ytrain)

## Predict test set

In [None]:
# Class-label predictions for the test set from the refit forest `rf_model`
yhat_test = rf_model.predict(xtest)

In [None]:
# --- Evaluation Metrics---
# Confusion matrix of true vs. predicted test labels
conf_matrix = confusion_matrix(ytest, yhat_test)

print('--- EVALUATION METRICS ---:')
print('\nConfusion Matrix:')
print(conf_matrix)

# Label-based scores, each computed from the same hard predictions
print('\nUsing 0.5 as threshold:')
for template, metric in (
    ('Accuracy = {:.5f}', accuracy_score),
    ('Precision = {:.5f}', precision_score),
    ('Recall = {:.5f}', recall_score),
    ('F1 score = {:.5f}', f1_score),
):
    print(template.format(metric(ytest, yhat_test)))

In [None]:
# Fix: the curves are drawn for `rf_model`, the forest fitted above; the
# previous argument `mod` is never defined in this notebook and raised a NameError.
# NOTE: plot_roc_curve / plot_precision_recall_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2; on newer versions use
# RocCurveDisplay.from_estimator / PrecisionRecallDisplay.from_estimator.

# ROC curve on the test set
plot_roc_curve(rf_model, xtest, ytest)

# Precision-recall curve on the test set
plot_precision_recall_curve(rf_model, xtest, ytest)