# Data Loading

In [None]:
# Fetch the dataset archive from Google Drive by file id, then unpack it
# quietly (-q) so the CSV files used below are available locally.
!gdown --id 1MIKKj8Gi-xUwhsYt6xEV6FSmX0_Le8iL
!unzip -q 'data-storm-20.zip'

Downloading...
From: https://drive.google.com/uc?id=1MIKKj8Gi-xUwhsYt6xEV6FSmX0_Le8iL
To: /content/data-storm-20.zip
  0% 0.00/1.23M [00:00<?, ?B/s]100% 1.23M/1.23M [00:00<00:00, 38.1MB/s]


# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler,Normalizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
#from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report,confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Pinned to the version this notebook was last run with, and installed via
# %pip so the package lands in the environment of the running kernel.
%pip install -q bayesian-optimization==1.2.0



Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11687 sha256=8f8e93e8580dcdae851134f49e88a9769917e89fbc4df72b012092cef6b67773
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Dataset function

In [None]:
def preprocessing_data(filename):
  """Load a labelled reservations CSV and return model-ready features and target.

  Processing order: one-hot encode the multi-valued categorical columns, map
  the two-valued columns and the target to integers, derive stay features from
  the three date columns, drop the raw dates, then standardise the numeric
  columns.

  Parameters
  ----------
  filename : str, path or file-like
      Passed straight to ``pd.read_csv``.

  Returns
  -------
  X : pandas.DataFrame
      All engineered features; ``Reservation-id`` and ``Reservation_Status``
      are removed.
  y : pandas.Series
      ``Reservation_Status`` encoded as 0 = Check-In, 1 = Canceled,
      2 = No-Show. Values outside these three become NaN (``Series.map``).

  Notes
  -----
  * A new ``StandardScaler`` is fitted on every call, so each file is scaled
    with its own mean and variance.
  * ``pd.get_dummies`` only creates columns for categories present in the
    file being read.
  """
  df = pd.read_csv(filename)

  dates = ['Expected_checkin', 'Expected_checkout', 'Booking_date']
  checkin, checkout, booked = dates

  one_hot_encoded_lst = ['Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type',
       'Meal_Type', 'Deposit_type', 'Booking_channel']

  df = pd.get_dummies(df, columns=one_hot_encoded_lst) #one hot encoding

  # Two-valued (and target) columns are replaced by integer codes in place.
  value_maps = {
      'Gender': {'F': 0, 'M': 1},
      'Visted_Previously': {'No': 0, 'Yes': 1},
      'Previous_Cancellations': {'No': 0, 'Yes': 1},
      'Required_Car_Parking': {'Yes': 1, 'No': 0},
      'Use_Promotion': {'Yes': 1, 'No': 0},
      'Reservation_Status': {'Check-In': 0, 'Canceled': 1, 'No-Show': 2},
  }
  for column, mapping in value_maps.items():
    df[column] = df[column].map(mapping)

  for column in dates: #dates engineering
    df[column] = pd.to_datetime(df[column])

  df['Expected_stay'] = (df[checkout] - df[checkin]).dt.days
  df['Booking_to_checkingin'] = (df[checkin] - df[booked]).dt.days

  # The stay covers weekdays checkin, checkin+1, ..., checkin+nights (Mon=0).
  # It touches Saturday (5) or Sunday (6) exactly when checkin + nights >= 5.
  # Using the stay length instead of only the two weekday numbers also catches
  # stays of a week or more, which the weekday-only comparison missed.
  nights = df['Expected_stay'].clip(lower=0)
  df['weekend_stay'] = ((df[checkin].dt.dayofweek + nights) >= 5).astype(int)

  df['Month_of_stay'] = df[checkin].dt.month

  df = df.drop(columns=dates)

  #scaling
  scale_lst = ['Age', 'Adults', 'Children', 'Babies', 'Discount_Rate', 'Room_Rate', 'Expected_stay', 'Booking_to_checkingin', 'Month_of_stay']

  df[scale_lst] = StandardScaler().fit_transform(df[scale_lst])

  X = df.drop(columns=['Reservation_Status', 'Reservation-id'])
  y = df['Reservation_Status']

  return X, y

In [None]:
def preprocessing_test(filename):
  """Load an unlabelled reservations CSV and return model-ready features.

  Applies the same feature steps as the labelled loader, minus the target:
  one-hot encode the multi-valued categorical columns, map the two-valued
  columns to integers, derive stay features from the three date columns,
  drop the raw dates, then standardise the numeric columns.

  Parameters
  ----------
  filename : str, path or file-like
      Passed straight to ``pd.read_csv``.

  Returns
  -------
  X : pandas.DataFrame
      All engineered features; ``Reservation-id`` is removed.

  Notes
  -----
  * A new ``StandardScaler`` is fitted on every call, so this file is scaled
    with its own mean and variance rather than the training set's.
  * ``pd.get_dummies`` only creates columns for categories present in the
    file being read.
  """
  df = pd.read_csv(filename)

  dates = ['Expected_checkin', 'Expected_checkout', 'Booking_date']
  checkin, checkout, booked = dates

  one_hot_encoded_lst = ['Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type',
       'Meal_Type', 'Deposit_type', 'Booking_channel']

  df = pd.get_dummies(df, columns=one_hot_encoded_lst) #one hot encoding

  # Two-valued columns are replaced by integer codes in place.
  value_maps = {
      'Gender': {'F': 0, 'M': 1},
      'Visted_Previously': {'No': 0, 'Yes': 1},
      'Previous_Cancellations': {'No': 0, 'Yes': 1},
      'Required_Car_Parking': {'Yes': 1, 'No': 0},
      'Use_Promotion': {'Yes': 1, 'No': 0},
  }
  for column, mapping in value_maps.items():
    df[column] = df[column].map(mapping)

  for column in dates: #dates engineering
    df[column] = pd.to_datetime(df[column])

  df['Expected_stay'] = (df[checkout] - df[checkin]).dt.days
  df['Booking_to_checkingin'] = (df[checkin] - df[booked]).dt.days

  # The stay covers weekdays checkin, checkin+1, ..., checkin+nights (Mon=0).
  # It touches Saturday (5) or Sunday (6) exactly when checkin + nights >= 5.
  # Using the stay length instead of only the two weekday numbers also catches
  # stays of a week or more, which the weekday-only comparison missed.
  nights = df['Expected_stay'].clip(lower=0)
  df['weekend_stay'] = ((df[checkin].dt.dayofweek + nights) >= 5).astype(int)

  df['Month_of_stay'] = df[checkin].dt.month

  df = df.drop(columns=dates)

  #scaling
  scale_lst = ['Age', 'Adults', 'Children', 'Babies', 'Discount_Rate', 'Room_Rate', 'Expected_stay', 'Booking_to_checkingin', 'Month_of_stay']

  df[scale_lst] = StandardScaler().fit_transform(df[scale_lst])

  X = df.drop(columns=['Reservation-id'])

  return X

# Datasets

In [None]:
# Training split: features and encoded target built from the training CSV.
# The trailing expression displays both shapes as the cell output.
X_train, y_train = preprocessing_data('Hotel-A-train.csv')
X_train.shape, y_train.shape

((27499, 43), (27499,))

In [None]:
# Validation split: same preprocessing, applied independently to the validation CSV.
# The trailing expression displays both shapes as the cell output.
X_val, y_val = preprocessing_data('Hotel-A-validation.csv')
X_val.shape, y_val.shape

((2749, 43), (2749,))

In [None]:
# Unlabelled test split: features only (the test CSV has no Reservation_Status column).
X_test = preprocessing_test('Hotel-A-test.csv')
X_test.shape

(4318, 43)

In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many leading components as are
# needed to explain that fraction (here 95%) of the training-set variance.
pca = PCA(.95)
pca.fit(X_train)

# Project all three splits with the components learned from the training set.
# The three names are rebound together, only after every transform has
# succeeded, so a failure on one split (e.g. a column mismatch) cannot leave
# X_train projected while X_val / X_test are still raw.
# NOTE(review): this cell overwrites its own inputs, so running it a second
# time fits PCA on already-projected data - re-run the Datasets cells first.
X_train, X_val, X_test = [pca.transform(split) for split in (X_train, X_val, X_test)]

# SMOTE

In [None]:
# Class balance of the training target before resampling.
for label in (0, 1, 2):
    trailer = " \n" if label == 2 else ""
    print("Before OverSampling, counts of label '{}': {}{}".format(label, sum(y_train == label), trailer))

# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)

# Seeded so the synthetic samples (and everything trained on them) are
# reproducible from one run to the next.
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)

# fit_resample is the supported sampler entry point; the older fit_sample
# alias has been removed from imbalanced-learn. np.ravel hands the sampler a
# flat label array whether y_train is a Series or already an ndarray.
#X_train_res, y_train_res = under.fit_resample(X_train, np.ravel(y_train))
X_train_res, y_train_res = over.fit_resample(X_train, np.ravel(y_train))

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class balance after SMOTE: every class is brought up to the majority count.
for label in (0, 1, 2):
    print("After OverSampling, counts of label '{}': {}".format(label, sum(y_train_res == label)))

Before OverSampling, counts of label '0': 21240
Before OverSampling, counts of label '1': 4134
Before OverSampling, counts of label '2': 2125 





After OverSampling, the shape of train_X: (63720, 43)
After OverSampling, the shape of train_y: (63720,) 

After OverSampling, counts of label '0': 21240
After OverSampling, counts of label '1': 21240
After OverSampling, counts of label '2': 21240


# Ensemble

No resampling (baseline on the original, imbalanced training set)

In [None]:
# Sanity check: shapes of the feature matrices the classifiers below are fitted and scored on.
X_train.shape, X_val.shape

((27499, 30), (2749, 30))

In [None]:
# Candidate models, each paired with the label printed above its report.
classifiers=[['Logistic Regression :',LogisticRegression()],
       ['Decision Tree Classification :',DecisionTreeClassifier()],
       ['Gradient Boosting Classification :', GradientBoostingClassifier()],
       ['Ada Boosting Classification :',AdaBoostClassifier()],
       ['Extra Tree Classification :', ExtraTreesClassifier()],
       ['K-Neighbors Classification :',KNeighborsClassifier()]]
       #['Support Vector Classification :',SVC()],
       #['Gaussian Naive Bayes :',GaussianNB()]]

# Validation accuracy of each model, in the same order as `classifiers`.
cla_pred=[]

for name,model in classifiers:

    # Fit on the training set as-is (no resampling) and score on validation.
    model.fit(X_train,y_train)
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val,predictions)
    cla_pred.append(accuracy)

    print(name)
    print(confusion_matrix(y_val,predictions))
    # zero_division=0 reports precision/F1 as 0 for a class the model never
    # predicts (what the report already showed) without emitting the
    # undefined-metric warning.
    print(classification_report(y_val,predictions,zero_division=0))
    print(accuracy)
    print("*********************************************")
    print()

Logistic Regression :
[[1610    0    0]
 [ 741    0    0]
 [ 398    0    0]]
              precision    recall  f1-score   support

           0       0.59      1.00      0.74      1610
           1       0.00      0.00      0.00       741
           2       0.00      0.00      0.00       398

    accuracy                           0.59      2749
   macro avg       0.20      0.33      0.25      2749
weighted avg       0.34      0.59      0.43      2749

0.5856675154601674
*********************************************



  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree Classification :
[[1179  278  153]
 [ 497  165   79]
 [ 278   82   38]]
              precision    recall  f1-score   support

           0       0.60      0.73      0.66      1610
           1       0.31      0.22      0.26       741
           2       0.14      0.10      0.11       398

    accuracy                           0.50      2749
   macro avg       0.35      0.35      0.35      2749
weighted avg       0.46      0.50      0.47      2749

0.5027282648235722
*********************************************



KeyboardInterrupt: ignored

Over sampling

In [None]:
# Candidate models, each paired with the label printed above its report.
classifiers = [
    ['Logistic Regression :', LogisticRegression()],
    ['Decision Tree Classification :', DecisionTreeClassifier()],
    ['Gradient Boosting Classification :', GradientBoostingClassifier()],
    ['Ada Boosting Classification :', AdaBoostClassifier()],
    ['Extra Tree Classification :', ExtraTreesClassifier()],
    ['K-Neighbors Classification :', KNeighborsClassifier()],
    ['Support Vector Classification :', SVC()],
    ['Gaussian Naive Bayes :', GaussianNB()],
]

# Validation accuracy of each model, in the same order as `classifiers`.
cla_pred = []

for name, model in classifiers:
    # Fit on the oversampled training set; score on the untouched validation set.
    model.fit(X_train_res, y_train_res)
    predictions = model.predict(X_val)
    cla_pred.append(accuracy_score(y_val, predictions))

    # One report per model: label, confusion matrix, per-class metrics,
    # accuracy, a separator line and a blank line.
    print(
        name,
        confusion_matrix(y_val, predictions),
        classification_report(y_val, predictions),
        cla_pred[-1],
        "*********************************************",
        "",
        sep="\n",
    )

Logistic Regression :
[[650 421 539]
 [271 190 280]
 [147 112 139]]
              precision    recall  f1-score   support

           0       0.61      0.40      0.49      1610
           1       0.26      0.26      0.26       741
           2       0.15      0.35      0.21       398

    accuracy                           0.36      2749
   macro avg       0.34      0.34      0.32      2749
weighted avg       0.45      0.36      0.38      2749

0.3561295016369589
*********************************************

Decision Tree Classification :
[[917 428 265]
 [398 213 130]
 [235  98  65]]
              precision    recall  f1-score   support

           0       0.59      0.57      0.58      1610
           1       0.29      0.29      0.29       741
           2       0.14      0.16      0.15       398

    accuracy                           0.43      2749
   macro avg       0.34      0.34      0.34      2749
weighted avg       0.44      0.43      0.44      2749

0.4347035285558385
********

# Gridsearch

In [None]:
# n_estimators = [10, 20, 50, 100]
# max_depth = [5,10,15,20]
# 2 criteria x 99 depths = 198 candidates, each fitted on 5 folds.
hyperparameters = [{'criterion': ['entropy', 'gini'], 'max_depth': range(1, 100, 1)}]

dtmodel = DecisionTreeClassifier()
# n_jobs=-1 spreads the candidate fits over all available cores; run
# serially, this search was slow enough to be interrupted by hand.
h_dtmodel = GridSearchCV(dtmodel, hyperparameters, cv=5, verbose=0, scoring='f1_macro', n_jobs=-1)

# NOTE(review): the folds are cut from the already-oversampled data, so
# synthetic neighbours of a validation-fold row can sit in the training
# folds - the CV score is likely optimistic. Consider resampling inside an
# imblearn Pipeline fitted on the original training set instead.
best_logmodel = h_dtmodel.fit(X_train_res, y_train_res)

print('Best criterion:', best_logmodel.best_params_['criterion'])
print('Best Max Depth:', best_logmodel.best_params_['max_depth'])

KeyboardInterrupt: ignored

In [None]:
# Default-hyperparameter decision tree fitted on the un-resampled training
# set. This is the model used for the submission predictions, so it is seeded:
# without random_state the tree (and the submission) changes on every run.
dtmodel = DecisionTreeClassifier(random_state=42)

dtmodel.fit(X_train, y_train)
predictions = dtmodel.predict(X_val)

# Validation report: confusion matrix, per-class metrics, overall accuracy.
print(confusion_matrix(y_val,predictions))
print(classification_report(y_val,predictions))
print(accuracy_score(y_val,predictions))

[[1191  269  150]
 [ 504  160   77]
 [ 273   82   43]]
              precision    recall  f1-score   support

           0       0.61      0.74      0.67      1610
           1       0.31      0.22      0.26       741
           2       0.16      0.11      0.13       398

    accuracy                           0.51      2749
   macro avg       0.36      0.35      0.35      2749
weighted avg       0.46      0.51      0.48      2749

0.5070934885412878


# Bayesian

In [None]:
# X_train_res.shape, y_train_res.shape

In [None]:
# X_train_res_pd = pd.DataFrame(X_train_res)
# y_train_res_pd = pd.DataFrame(y_train_res)

In [None]:
# X_train_res_pd.shape, y_train_res_pd.shape

In [None]:
# y_train_res_pd[0].shape

In [None]:
# #LGBMClassifier

# from sklearn.model_selection import StratifiedKFold
# from lightgbm import LGBMClassifier

# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# def lgbm_evaluate(**params):
# #     warnings.simplefilter('ignore')
    
#     params['num_leaves'] = int(params['num_leaves'])
#     params['max_depth'] = int(params['max_depth'])
        
#     clf = LGBMClassifier(**params, n_estimators=20000, nthread=-1)

#     test_pred_proba = np.zeros((X_train_res_pd.shape[0], 3))
    
#     for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train_res_pd, y_train_res_pd)):
#         X_train_bo, X_valid = X_train_res_pd.iloc[train_idx], X_train_res_pd.iloc[valid_idx]
#         y_train_bo, y_valid = y_train_res_pd[0].iloc[train_idx], y_train_res_pd[0].iloc[valid_idx]
        
#         model = LGBMClassifier(**params, n_estimators = 10000, n_jobs = -1)
#         model.fit(X_train_bo, y_train_bo, 
#                 eval_set=[(X_train_bo, y_train_bo), (X_valid, y_valid)], eval_metric='binary_logloss',
#                 verbose=False, early_stopping_rounds=200)

#         y_pred_valid = model.predict_proba(X_valid)

#         test_pred_proba[valid_idx] = y_pred_valid

#     return accuracy_score(y_valid, y_pred_valid.argmax(1))

In [None]:
# #hyper parameter tuning
# params = {'colsample_bytree': (0.8, 1),
#      'learning_rate': (.001, .01), 
#       'num_leaves': (8, 128), 
#       'subsample': (0.4, 1), 
#       'max_depth': (16, 32), 
#       # 'reg_alpha': (.05, 15.0), 
#       # 'reg_lambda': (.05, 15.0), 
#       'min_split_gain': (.001, .02),
#       'min_child_weight': (12, 80)}

# from sklearn.model_selection import StratifiedKFold
# n_fold = 20
# folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=11)

# from bayes_opt import BayesianOptimization
# bo = BayesianOptimization(lgbm_evaluate, params)
# bo.maximize(init_points=5, n_iter=5)

In [None]:
# params = {'num_leaves': int(bo.max['params']['num_leaves']), #parameter finding
#           'min_data_in_leaf': int(bo.max['params']['min_child_weight']),
#           'min_split_gain': bo.max['params']['min_split_gain'],
#           'objective': 'binary',
#           'max_depth': int(bo.max['params']['max_depth']),
#           'learning_rate': bo.max['params']['learning_rate'],
#           "boosting": "gbdt",
#           "bagging_freq": 5,
#           "bagging_fraction": bo.max['params']['subsample'],
#           "bagging_seed": 11,
#           "verbosity": -1,
#           # 'reg_alpha': bo.max['params']['reg_alpha'],
#           # 'reg_lambda': bo.max['params']['reg_lambda'],
#           "num_class": 1,
#           'nthread': -1
#          }

In [None]:
# params

In [None]:
# from lightgbm import LGBMClassifier #model fitting
# from sklearn.metrics import accuracy_score

# model = LGBMClassifier(**params, n_estimators = 100, n_jobs = -1)
# model.fit(X_train, y_train, 
#         eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='binary_logloss',
#         verbose=5000, early_stopping_rounds=1)

# predictions = model.predict(X_val)
# predictions_ = model.predict(X_train)

# print(accuracy_score(y_val, predictions), accuracy_score(y_train, predictions_))
# print(confusion_matrix(y_val, predictions))
# print(classification_report(y_val, predictions))

# Predictions

In [None]:
# Re-read the raw test file with its first column (Reservation-id) as the
# index; this frame supplies the row ids for the submission.
data_submission = pd.read_csv('Hotel-A-test.csv', index_col=0)
data_submission.head()

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,Booking_date,Adults,Children,Babies,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Use_Promotion,Discount_Rate,Room_Rate
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
62931593,F,52,Latino,Grad,25K --50K,South,City Hotel,11/18/2016,11/19/2016,10/28/2016,3,3,0,HB,No,No,No Deposit,Direct,Yes,Yes,10,153
70586099,F,47,Latino,Grad,25K --50K,East,Airport Hotels,11/18/2016,11/19/2016,8/6/2016,2,1,0,FB,No,No,No Deposit,Online,No,No,0,210
4230648,F,28,Asian American,Grad,<25K,East,City Hotel,4/28/2017,5/1/2017,4/8/2017,2,2,0,BB,No,No,No Deposit,Agent,No,Yes,5,117
25192322,F,65,caucasian,High-School,25K --50K,South,Airport Hotels,11/18/2016,11/20/2016,5/20/2016,1,3,2,FB,No,No,No Deposit,Online,Yes,Yes,10,107
80931528,M,45,African American,College,25K --50K,South,City Hotel,11/18/2016,11/20/2016,10/31/2016,3,1,0,BB,No,No,Refundable,Agent,No,No,0,119


In [None]:
# Class predictions (0/1/2 encoding) for the preprocessed test features,
# from whichever estimator is currently bound to `dtmodel`.
submission_predictions = dtmodel.predict(X_test)

In [None]:
# Shift the 0/1/2 model labels up by one, giving the 1/2/3 codes written to
# the submission (presumably the numbering the competition expects - confirm).
submission_predictions = [code + 1 for code in submission_predictions.tolist()]

# Number of predicted reservations per submitted code (1, 2, 3).
tuple(submission_predictions.count(code) for code in (1, 2, 3))

(3042, 817, 459)

In [None]:
# Every non-index column of the raw test frame; dropping them all leaves
# just the Reservation-id index.
col_drop = data_submission.columns.tolist()

In [None]:
# Index-only frame (Reservation-id) that the predictions are attached to.
# The columns= keyword replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
submission = data_submission.drop(columns=col_drop)

In [None]:
# Preview: at this point the frame has an index and no columns.
submission.head()

62931593
70586099
4230648
25192322
80931528


In [None]:
# Attach the predictions by position (a plain array, so no index alignment
# takes place); row order matches the test file both frames were read from.
submission['Reservation_status'] = pd.Series(submission_predictions, name='Reservation_status').to_numpy()

In [None]:
# Preview the finished submission: Reservation-id index plus the predicted status code.
submission.head()

Unnamed: 0_level_0,Reservation_status
Reservation-id,Unnamed: 1_level_1
62931593,1
70586099,3
4230648,1
25192322,1
80931528,2


In [None]:
# Write the submission file; the Reservation-id index is written as the first column.
submission.to_csv('submission-2-day3_Bimsara3.csv')

In [None]:
# Final check: how many reservations were assigned to each predicted status code.
submission['Reservation_status'].value_counts()

1    3042
2     817
3     459
Name: Reservation_status, dtype: int64