In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading data
total_data = pd.read_csv('hotel_bookings_ohe.csv')
train_index = pd.read_csv('train_index.csv')

In [3]:
#select train data from training index
train = total_data.iloc[train_index.Train_Index,:]
train.reset_index(inplace=True)
train.head()

Unnamed: 0,index,arrival_date_year,adults,children,babies,previous_cancellations,booking_changes,total_of_special_requests,log_lead_time,total_nights,...,market_segment_Online TA,market_segment_Undefined,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,required_car_parking_0,required_car_parking_1,is_canceled
0,35564,2017,2,0,0,0,0,1,4.663439,2,...,1,0,0,0,1,0,0,1,0,0
1,24402,2016,1,0,0,0,0,0,0.0,1,...,0,0,1,0,0,0,0,1,0,0
2,117756,2017,2,1,1,0,1,1,5.135798,7,...,1,0,1,0,0,0,0,1,0,0
3,5653,2016,2,0,0,0,0,0,3.044522,2,...,0,0,1,0,0,0,0,1,0,1
4,105241,2017,1,0,0,0,0,0,0.693147,1,...,0,0,1,0,0,0,0,1,0,0


In [4]:
#select test data
test = total_data.iloc[total_data.index.difference(train_index.Train_Index),:]
test.reset_index(inplace=True)
test.head()


Unnamed: 0,index,arrival_date_year,adults,children,babies,previous_cancellations,booking_changes,total_of_special_requests,log_lead_time,total_nights,...,market_segment_Online TA,market_segment_Undefined,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,required_car_parking_0,required_car_parking_1,is_canceled
0,2,2015,1,0,0,0,0,0,2.079442,1,...,0,0,1,0,0,0,0,1,0,0
1,3,2015,1,0,0,0,0,0,2.639057,1,...,0,0,1,0,0,0,0,1,0,0
2,8,2015,2,0,0,0,0,1,4.454347,3,...,1,0,1,0,0,0,0,1,0,1
3,9,2015,2,0,0,0,0,0,4.330733,3,...,0,0,0,0,1,0,0,1,0,1
4,10,2015,2,0,0,0,0,0,3.178054,4,...,1,0,1,0,0,0,0,1,0,1


In [5]:
#select train x, train y, test x, test y
train_x = train.drop(['index','is_canceled'],axis=1)
train_y = train['is_canceled']

test_x = test.drop(['index','is_canceled'],axis=1)
test_y = test['is_canceled']


# Naive Beyes Classifier - Gaussian Naive Bayes Algorithm

In [6]:
# standardization for train and test data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() #we can use RobustScaler() to avoid the outlier influence
train_x  = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [7]:
#cross validation and grid search for parameters optimization

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.datasets import make_classification

gnb = GaussianNB() #define model
param_grid = {'var_smoothing': [1e-10,1e-9, 1e-8, 1e-7, 1e-6, 1e-5,1e-4,1e-3,1e-2,1e-1]} #grid search to select the best parameters

grid_search = GridSearchCV(gnb, param_grid, cv=10, scoring='roc_auc') #10 fold cross validation by using AUC as a decision point
grid_search.fit(train_x, train_y)


GridSearchCV(cv=10, estimator=GaussianNB(),
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06,
                                           1e-05, 0.0001, 0.001, 0.01, 0.1]},
             scoring='roc_auc')

In [8]:
#the best model in Naive Bayes Classifiers
print('Best hyperparameters:', grid_search.best_params_)
print('Best cross-validation AUC score:', grid_search.best_score_)


Best hyperparameters: {'var_smoothing': 0.01}
Best cross-validation AUC score: 0.8301650451392858


In [9]:
#retrain  Naive Bayes Classifiers, and predict test set

gnb_retrain = GaussianNB(var_smoothing=grid_search.best_params_['var_smoothing'])
gnb_retrain.fit(train_x, train_y)
y_pred = gnb_retrain.predict(test_x)

# Print the Confusion Matrix and slice it into four pieces
from sklearn.metrics import confusion_matrix
import seaborn as sns 
cm = confusion_matrix(test_y, y_pred)
cm


array([[7012, 8037],
       [ 793, 8035]])

In [10]:
#classification metrices in test set [f1-score]
from sklearn.metrics import classification_report
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.47      0.61     15049
           1       0.50      0.91      0.65      8828

    accuracy                           0.63     23877
   macro avg       0.70      0.69      0.63     23877
weighted avg       0.75      0.63      0.63     23877



In [11]:
# Compute the misclassification rate
cm = confusion_matrix(test_y, y_pred)

misclass_rate = (cm[0,1] + cm[1,0]) / sum(sum(cm))

print("Misclassification rate in test dataset: ", misclass_rate)

Misclassification rate in test dataset:  0.3698119529254094


In [12]:
#AUC performance in test set
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(test_y, y_pred)
print('ROC AUC in test dataset : {:.4f}'.format(ROC_AUC))


ROC AUC in test dataset : 0.6881


In [None]:
#