In [0]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import matplotlib
import matplotlib.pyplot as plt 
from matplotlib import style
%matplotlib inline

# Algorithms
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import preprocessing

# import cross validation and other evaluation tool 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Evaulation tools
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib as mpl
from sklearn.metrics import precision_recall_curve

# set display options
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 300
fig=plt.figure(figsize=(18,6))

NameError: name 'plt' is not defined

In [0]:
# load dataset
data = pd.read_csv("Hotel_Bookings_Prepared.csv", index_col = "Unnamed: 0")

In [0]:
# feature selection
features = ['lead_time',
            'stays_in_weekend_nights',
            'stays_in_week_nights',
            'adults',
            'children',
            'babies',
            'is_repeated_guest',
            'previous_cancellations',
            'previous_bookings_not_canceled',
            'booking_changes',
            'days_in_waiting_list',
            'adr',
            'required_car_parking_spaces',
            'total_of_special_requests',
            'is_company',
            'is_agent',
            'is_diff_room_type',
            'is_holiday',
            'hotel_City Hotel',
            'hotel_Resort Hotel',
            'arrival_date_month_April',
            'arrival_date_month_August',
            'arrival_date_month_December',
            'arrival_date_month_February',
            'arrival_date_month_January',
            'arrival_date_month_July',
            'arrival_date_month_June',
            'arrival_date_month_March',
            'arrival_date_month_May',
            'arrival_date_month_November',
            'arrival_date_month_October',
            'arrival_date_month_September',
            'meal_BB',
            'meal_FB',
            'meal_HB',
            'meal_SC',
            'meal_Undefined',
            'market_segment_Aviation',
            'market_segment_Complementary',
            'market_segment_Corporate',
            'market_segment_Direct',
            'market_segment_Groups',
            'market_segment_Offline TA/TO',
            'market_segment_Online TA',
            'market_segment_Undefined',
            'distribution_channel_Corporate',
            'distribution_channel_Direct',
            'distribution_channel_GDS',
            'distribution_channel_TA/TO',
            'distribution_channel_Undefined',
            'customer_type_Contract',
            'customer_type_Group',
            'customer_type_Transient',
            'customer_type_Transient-Party',
            'deposit_type_No Deposit',
            'deposit_type_Non Refund',
            'deposit_type_Refundable',
            'arrival_date_day_of_the_week_Friday',
            'arrival_date_day_of_the_week_Monday',
            'arrival_date_day_of_the_week_Saturday',
            'arrival_date_day_of_the_week_Sunday',
            'arrival_date_day_of_the_week_Thursday',
            'arrival_date_day_of_the_week_Tuesday',
            'arrival_date_day_of_the_week_Wednesday',
            'continent_Africa',
            'continent_Antarctica',
            'continent_Asia',
            'continent_Australia',
            'continent_Europe',
            'continent_North America',
            'continent_South America']

In [0]:
# define model features and label
X = data[features]
# change target column to array
y = data['is_canceled'].values.ravel()

In [0]:
# Split data - 40% for testing, 60% for training
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=2)

In [0]:
# Run trial - use model (LRM_trial) with default parameters
LRM_trial = LogisticRegression()
LRM_trial.fit(X_train,y_train)

# LRM_trial - show top 15 coefficient of the features in descending order of absolute value 
LRM_trial_coefficient = pd.DataFrame(LRM_trial.coef_.flatten(), X_train.columns, columns=['Coefficient'])  
LRM_trial_coefficient.iloc[LRM_trial_coefficient['Coefficient'].abs().argsort()][::-1].head(15) 

# LRM_trial - predict labels in testing set
LRM_trial_y_pred = LRM_trial.predict(X_test) # LRM_trial_y_pred = predicted labels from testing set features (X_test)

# LRM_trial - predict probability in testing set
LRM_trial_y_pred_prob = LRM_trial.predict_proba(X_test) ## LRM_trial_y_pred_prob = the probabilistic predictions

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
# Performance of LRM_trial
print('Performance of model - LRM_trial')
print()
print("Accuracy: \n",accuracy_score(y_test,LRM_trial_y_pred,normalize=True,sample_weight=None))
print()
print("Classification Report: \n",classification_report(y_test,LRM_trial_y_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(y_test,LRM_trial_y_pred))
print()
precision, recall, _ = precision_recall_curve(y_test, LRM_trial_y_pred_prob[:,0])
print("PR-AUC: \n",auc(recall, precision))
print()
fpr, tpr, thresholds = metrics.roc_curve(y_test,LRM_trial_y_pred_prob[:,0],pos_label=0)
print('ROC-AUC: \n',metrics.auc(fpr, tpr))
print()

Performance of model - LRM_trial

Accuracy: 
 0.8045271798308066

Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.92      0.86     30163
           1       0.82      0.61      0.70     17593

    accuracy                           0.80     47756
   macro avg       0.81      0.76      0.78     47756
weighted avg       0.81      0.80      0.80     47756


Confusion Matrix: 
 [[27770  2393]
 [ 6942 10651]]

PR-AUC: 
 0.22605268355400862

ROC-AUC: 
 0.8589520169348955



In [0]:
# Tuning Parameters with GridSearchCV()
try_grid = {
            'penalty':['l1', 'l2', 'elasticnet'],
            'dual':[False],
            'tol':[1e-4],
            'C':[0.8,1.0,1.2,1.4,1.6,2.0],
            'fit_intercept':[True],
            'intercept_scaling':[1],
            'class_weight':[None],
            'random_state':[None],
            'solver':['saga'],
            'max_iter':[4000],
            'multi_class':['ovr'],
            'verbose':[0],
            'warm_start':[False],
            'l1_ratio':[0.5]
           }
LRM = GridSearchCV(LogisticRegression(), param_distributions=try_grid, cv=10,n_jobs=-1 )

NameError: name 'GridSearchCV' is not defined

In [0]:
# Fit to training dataset
LRM.fit(X_train,y_train)
# Check the best score and the best parameters
print(LRM.best_score_)
print(LRM.best_params_)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='warn', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='warn', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributio...
                                        'class_weight': [None], 'dual': [False],
                                        'fit_intercept': [True],
                                        'intercept_scaling': [1],
                     

In [0]:
# See the best score and the best parameters
print(LRM.best_score_)
print(LRM.best_params_)

0.8143339754865008
{'warm_start': False, 'verbose': 0, 'tol': 0.0001, 'solver': 'saga', 'random_state': None, 'penalty': 'l1', 'n_jobs': -1, 'multi_class': 'ovr', 'max_iter': 4000, 'l1_ratio': 0.5, 'intercept_scaling': 1, 'fit_intercept': True, 'dual': False, 'class_weight': None, 'C': 2.0}


In [0]:
# Define the model with the best parameters obtained
# 'C': 2.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': 0.5, 'max_iter': 4000, 'multi_class': 'ovr', 'penalty': 'l1', 'random_state': None, 'solver': 'saga', 'tol': 0.0001, 'verbose': 0, 'warm_start': False
LRM = LogisticRegression(C=2.0,
                         class_weight=None, 
                         dual=False,
                         fit_intercept=True,
                         intercept_scaling=1,
                         l1_ratio=0.5,
                         max_iter=4000,
                         multi_class='ovr',
                         penalty='l1',
                         random_state=None,
                         solver='saga', 
                         tol=0.0001,
                         verbose=0,
                         warm_start=False)

In [0]:
# Get average acuracy rate of the model performacne on traingin dataset
score_cv = cross_val_score(LRM, X_train, y_train, cv=10)
score_cv.mean()

In [0]:
# LRM - fit model
LRM.fit(X_train,y_train)

# LRM - show top 15 coefficient of the features in descending order of absolute value 
LRM_coefficient = pd.DataFrame(LRM.coef_.flatten(), X_train.columns, columns=['Coefficient'])  
LRM_coefficient.iloc[LRM_coefficient['Coefficient'].abs().argsort()][::-1].head(15)

# LRM - predict labels in testing set
y_pred = LRM.predict(X_test) # y_pred = predicted labels from testing set features (X_test)

# LRM - predict probability in testing set
y_pred_prob = LRM.predict_proba(X_test) # y_pred_prob = the probabilistic predictions

  "(penalty={})".format(self.penalty))


In [0]:
# LRM - show coefficient of the features
LRM_coefficient = pd.DataFrame(LRM.coef_.flatten(), X_train.columns, columns=['Coefficient'])  
#LRM_coefficient.loc[LRM1_coefficient['Coefficient'] != 0]
LRM_coefficient.iloc[LRM_coefficient['Coefficient'].abs().argsort()][::-1].head(15)

Unnamed: 0,Coefficient
previous_cancellations,72.647677
required_car_parking_spaces,-48.873649
previous_bookings_not_canceled,-35.193844
adr,23.355546
booking_changes,-7.759137
deposit_type_Non Refund,4.898585
adults,4.009538
lead_time,2.880451
stays_in_week_nights,2.184289
is_diff_room_type,-1.796561


In [0]:
# Performance of LRM
print('Performance of model - LRM')
print()
print("Accuracy: \n",accuracy_score(y_test,y_pred,normalize=True,sample_weight=None))
print()
print("Classification Report: \n",classification_report(y_test,y_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(y_test,y_pred))
print()
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob[:,0])
print("PR-AUC: \n",auc(recall, precision))
print()
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred_prob[:,0],pos_label=0)
print('ROC-AUC: \n',metrics.auc(fpr, tpr))
print('----------------------------------------------------------------------')

# Performance of LRM_trial
print('Performance of model - LRM_trial')
print()
print("Accuracy: \n",accuracy_score(y_test,LRM_trial_y_pred,normalize=True,sample_weight=None))
print()
print("Classification Report: \n",classification_report(y_test,LRM_trial_y_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(y_test,LRM_trial_y_pred))
print()
precision_trial, recall_trial, _trial = precision_recall_curve(y_test, LRM_trial_y_pred_prob[:,0])
print("PR-AUC: \n",auc(recall_trial, precision_trial))
print()
fpr_trial, tpr_trial, thresholds_trial = metrics.roc_curve(y_test,LRM_trial_y_pred_prob[:,0],pos_label=0)
print('ROC-AUC: \n',metrics.auc(fpr_trial, tpr_trial))
print('----------------------------------------------------------------------')


# Introduce majority classifier as the benchmark model
pred_val_maj = np.zeros(y_test.shape, dtype = int)

# Evaluation Tools Majority Classifier
print('Performance of majority classifier')
print()
print("Accuracy: \n",accuracy_score(y_test,pred_val_maj,normalize=True,sample_weight=None))
print()
print("Classification Report: \n",classification_report(y_test,pred_val_maj))
print()
print("Confusion Matrix: \n",confusion_matrix(y_test,pred_val_maj))

Performance of model - LRM

Accuracy: 
 0.8166094312756512

Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.92      0.86     30163
           1       0.83      0.63      0.72     17593

    accuracy                           0.82     47756
   macro avg       0.82      0.78      0.79     47756
weighted avg       0.82      0.82      0.81     47756


Confusion Matrix: 
 [[27866  2297]
 [ 6461 11132]]

PR-AUC: 
 0.2245094221906389

ROC-AUC: 
 0.868196511792926
----------------------------------------------------------------------
Performance of model - LRM_trial

Accuracy: 
 0.8045271798308066

Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.92      0.86     30163
           1       0.82      0.61      0.70     17593

    accuracy                           0.80     47756
   macro avg       0.81      0.76      0.78     47756
weighted avg       0.81      0.80      0.8

In [0]:
# ROC of the best model (LRM)
plt.plot(fpr,tpr)

NameError: name 'plt' is not defined