Downloading the Dataset

In [1]:
!gdown --id 1MIKKj8Gi-xUwhsYt6xEV6FSmX0_Le8iL
!unzip -q 'data-storm-20.zip'

Downloading...
From: https://drive.google.com/uc?id=1MIKKj8Gi-xUwhsYt6xEV6FSmX0_Le8iL
To: /content/data-storm-20.zip
  0% 0.00/1.23M [00:00<?, ?B/s]100% 1.23M/1.23M [00:00<00:00, 77.5MB/s]


### Preparing Data & Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data_train = pd.read_csv('Hotel-A-train.csv', index_col=0)
data_validation = pd.read_csv('Hotel-A-validation.csv', index_col = 0)

In [4]:
data_train.shape, data_validation.shape

((27499, 23), (2749, 23))

In [5]:
data_train = data_train.append(data_validation)

In [6]:
data_train.shape

(30248, 23)

In [7]:
data_train.head()

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,Booking_date,Adults,Children,Babies,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
39428300,F,40,Latino,Grad,<25K,North,City Hotel,7/1/2015,7/2/2015,5/21/2015,2,2,0,BB,No,No,No Deposit,Online,Yes,Check-In,Yes,10,218
77491756,F,49,Latino,Mid-School,50K -- 100K,East,City Hotel,7/1/2015,7/2/2015,5/26/2015,3,3,0,BB,No,No,Refundable,Online,Yes,Check-In,No,0,185
73747291,F,42,caucasian,Grad,<25K,East,City Hotel,7/2/2015,7/6/2015,6/29/2015,3,3,0,BB,No,No,No Deposit,Online,Yes,Check-In,No,0,119
67301739,M,25,African American,College,>100K,South,Airport Hotels,7/2/2015,7/3/2015,6/20/2015,4,3,0,BB,No,No,Refundable,Agent,Yes,Check-In,Yes,5,144
77222321,F,62,Latino,High-School,25K --50K,East,Resort,7/3/2015,7/4/2015,6/20/2015,1,1,0,BB,No,No,No Deposit,Direct,No,Check-In,Yes,10,242


In [8]:
data_train.dtypes

Gender                    object
Age                        int64
Ethnicity                 object
Educational_Level         object
Income                    object
Country_region            object
Hotel_Type                object
Expected_checkin          object
Expected_checkout         object
Booking_date              object
Adults                     int64
Children                   int64
Babies                     int64
Meal_Type                 object
Visted_Previously         object
Previous_Cancellations    object
Deposit_type              object
Booking_channel           object
Required_Car_Parking      object
Reservation_Status        object
Use_Promotion             object
Discount_Rate              int64
Room_Rate                  int64
dtype: object

In [9]:
data_train.describe()

Unnamed: 0,Age,Adults,Children,Babies,Discount_Rate,Room_Rate
count,30248.0,30248.0,30248.0,30248.0,30248.0,30248.0
mean,43.997388,2.333873,1.743586,0.34994,12.494214,175.009786
std,15.294786,1.177353,0.722228,0.572767,11.202236,43.878472
min,18.0,1.0,1.0,0.0,0.0,100.0
25%,31.0,2.0,1.0,0.0,5.0,137.0
50%,44.0,2.0,2.0,0.0,10.0,174.0
75%,57.0,3.0,2.0,1.0,20.0,214.0
max,70.0,5.0,3.0,2.0,40.0,250.0


In [10]:
data_train.isnull().sum()

Gender                    0
Age                       0
Ethnicity                 0
Educational_Level         0
Income                    0
Country_region            0
Hotel_Type                0
Expected_checkin          0
Expected_checkout         0
Booking_date              0
Adults                    0
Children                  0
Babies                    0
Meal_Type                 0
Visted_Previously         0
Previous_Cancellations    0
Deposit_type              0
Booking_channel           0
Required_Car_Parking      0
Reservation_Status        0
Use_Promotion             0
Discount_Rate             0
Room_Rate                 0
dtype: int64

In [11]:
data_train.columns

Index(['Gender', 'Age', 'Ethnicity', 'Educational_Level', 'Income',
       'Country_region', 'Hotel_Type', 'Expected_checkin', 'Expected_checkout',
       'Booking_date', 'Adults', 'Children', 'Babies', 'Meal_Type',
       'Visted_Previously', 'Previous_Cancellations', 'Deposit_type',
       'Booking_channel', 'Required_Car_Parking', 'Reservation_Status',
       'Use_Promotion', 'Discount_Rate', 'Room_Rate'],
      dtype='object')

In [12]:
object_cols = ['Gender', 'Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type', 
       'Meal_Type', 'Visted_Previously', 'Previous_Cancellations',
       'Deposit_type', 'Booking_channel', 'Required_Car_Parking',
       'Reservation_Status', 'Use_Promotion']
# these columns need to be onehot encode or vectorise

dates = ['Expected_checkin', 'Expected_checkout', 'Booking_date'] #need to feature engineer

In [13]:
for col in object_cols:
  print(col, pd.unique(data_train[col]))

Gender ['F' 'M']
Ethnicity ['Latino' 'caucasian' 'African American' 'Asian American']
Educational_Level ['Grad' 'Mid-School' 'College' 'High-School']
Income ['<25K' '50K -- 100K' '>100K' '25K --50K']
Country_region ['North' 'East' 'South' 'West']
Hotel_Type ['City Hotel' 'Airport Hotels' 'Resort']
Meal_Type ['BB' 'FB' 'HB']
Visted_Previously ['No' 'Yes']
Previous_Cancellations ['No' 'Yes']
Deposit_type ['No Deposit' 'Refundable' 'Non-Refundable']
Booking_channel ['Online' 'Agent' 'Direct']
Required_Car_Parking ['Yes' 'No']
Reservation_Status ['Check-In' 'Canceled' 'No-Show']
Use_Promotion ['Yes' 'No']


In [14]:
one_hot_encoded_lst = ['Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type', 
       'Meal_Type', 'Deposit_type', 'Booking_channel'] 

In [15]:
data_train = pd.get_dummies(data_train, columns=one_hot_encoded_lst)

In [16]:
data_train.columns

Index(['Gender', 'Age', 'Expected_checkin', 'Expected_checkout',
       'Booking_date', 'Adults', 'Children', 'Babies', 'Visted_Previously',
       'Previous_Cancellations', 'Required_Car_Parking', 'Reservation_Status',
       'Use_Promotion', 'Discount_Rate', 'Room_Rate',
       'Ethnicity_African American', 'Ethnicity_Asian American',
       'Ethnicity_Latino', 'Ethnicity_caucasian', 'Educational_Level_College',
       'Educational_Level_Grad', 'Educational_Level_High-School',
       'Educational_Level_Mid-School', 'Income_25K --50K',
       'Income_50K -- 100K', 'Income_<25K', 'Income_>100K',
       'Country_region_East', 'Country_region_North', 'Country_region_South',
       'Country_region_West', 'Hotel_Type_Airport Hotels',
       'Hotel_Type_City Hotel', 'Hotel_Type_Resort', 'Meal_Type_BB',
       'Meal_Type_FB', 'Meal_Type_HB', 'Deposit_type_No Deposit',
       'Deposit_type_Non-Refundable', 'Deposit_type_Refundable',
       'Booking_channel_Agent', 'Booking_channel_Direct',
  

In [17]:
data_train['Gender'] = data_train['Gender'].map({'F':0, 'M':1})
data_train['Visted_Previously'] = data_train['Visted_Previously'].map({'No':0, 'Yes':1})
data_train['Previous_Cancellations'] = data_train['Previous_Cancellations'].map({'No':0, 'Yes':1})
data_train['Required_Car_Parking'] = data_train['Required_Car_Parking'].map({'Yes':1, 'No':0})
data_train['Use_Promotion'] = data_train['Use_Promotion'].map({'Yes':1, 'No':0})
data_train['Reservation_Status'] = data_train['Reservation_Status'].map({'Check-In':1, 'Canceled':2, 'No-Show':3})

In [18]:
data_train[dates[0]] = pd.to_datetime(data_train[dates[0]])
data_train[dates[1]] = pd.to_datetime(data_train[dates[1]])
data_train[dates[2]] = pd.to_datetime(data_train[dates[2]])

In [19]:
data_train['Expected_stay'] = (data_train[dates[1]] - data_train[dates[0]]).dt.days
data_train['Expected_stay'].unique()

array([1, 4, 3, 2])

In [20]:
data_train['Booking_to_checkingin'] = (data_train[dates[0]] - data_train[dates[2]]).dt.days

In [21]:
data_train['Month_of_stay'] = data_train[dates[0]].dt.month

In [22]:
weekdayin = data_train[dates[0]].dt.dayofweek
weekdayout = data_train[dates[1]].dt.dayofweek
from pandas import DataFrame

fina = []
for x,y in zip(weekdayin, weekdayout):
  t = []
  if y >= x:
    for i in range(x, y + 1):
      t.append(i)
    if 5 in t or 6 in t:
      fina.append(1)
    else:
      fina.append(0)
  else:
    for i in range(x, 7):
      t.append(i)
    for j in range(0, y + 1):
      t.append(i)
    if 5 in t or 6 in t:
      fina.append(1)
    else:
      fina.append(0)
data_train['weekend_stay'] = DataFrame(fina,columns=['weekend_stay'])['weekend_stay'].values

In [23]:
data_train.isnull().sum()

Gender                           0
Age                              0
Expected_checkin                 0
Expected_checkout                0
Booking_date                     0
Adults                           0
Children                         0
Babies                           0
Visted_Previously                0
Previous_Cancellations           0
Required_Car_Parking             0
Reservation_Status               0
Use_Promotion                    0
Discount_Rate                    0
Room_Rate                        0
Ethnicity_African American       0
Ethnicity_Asian American         0
Ethnicity_Latino                 0
Ethnicity_caucasian              0
Educational_Level_College        0
Educational_Level_Grad           0
Educational_Level_High-School    0
Educational_Level_Mid-School     0
Income_25K --50K                 0
Income_50K -- 100K               0
Income_<25K                      0
Income_>100K                     0
Country_region_East              0
Country_region_North

In [25]:
data_train.head()

Unnamed: 0_level_0,Gender,Age,Expected_checkin,Expected_checkout,Booking_date,Adults,Children,Babies,Visted_Previously,Previous_Cancellations,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate,Ethnicity_African American,Ethnicity_Asian American,Ethnicity_Latino,Ethnicity_caucasian,Educational_Level_College,Educational_Level_Grad,Educational_Level_High-School,Educational_Level_Mid-School,Income_25K --50K,Income_50K -- 100K,Income_<25K,Income_>100K,Country_region_East,Country_region_North,Country_region_South,Country_region_West,Hotel_Type_Airport Hotels,Hotel_Type_City Hotel,Hotel_Type_Resort,Meal_Type_BB,Meal_Type_FB,Meal_Type_HB,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Expected_stay,Booking_to_checkingin,Month_of_stay,weekend_stay
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
39428300,0,40,2015-07-01,2015-07-02,2015-05-21,2,2,0,0,0,1,1,1,10,218,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,41,7,0
77491756,0,49,2015-07-01,2015-07-02,2015-05-26,3,3,0,0,0,1,1,0,0,185,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,36,7,0
73747291,0,42,2015-07-02,2015-07-06,2015-06-29,3,3,0,0,0,1,1,0,0,119,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,4,3,7,1
67301739,1,25,2015-07-02,2015-07-03,2015-06-20,4,3,0,0,0,1,1,1,5,144,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,12,7,0
77222321,0,62,2015-07-03,2015-07-04,2015-06-20,1,1,0,0,0,0,1,1,10,242,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,13,7,1


In [26]:
def find_correlation(data, threshold=0.85): #removing correlated rows > 0.85
    corr_mat = data.corr()
    corr_mat.loc[:, :] = np.tril(corr_mat, k=-1)
    already_in = set()
    result = []
    for col in corr_mat:
        perfect_corr = corr_mat[col][abs(corr_mat[col])> threshold].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            result.append(perfect_corr)
    select_nested = [f[1:] for f in result]
    select_flat = [i for j in select_nested for i in j]
    return select_flat

In [27]:
find_correlation(data_train)

[]

In [None]:
 data_train = data_train.drop(dates, 1)

In [46]:
from sklearn.utils import shuffle
data_train = shuffle(data_train)

In [47]:
x = data_train.drop('Reservation_Status', 1)
y = data_train['Reservation_Status']

In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .5)

In [49]:
from sklearn.linear_model import LogisticRegression #trying different models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
#from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [50]:
classifiers = [['Logistic Regression :', LogisticRegression(max_iter = 1500)],
       ['Decision Tree Classification :', DecisionTreeClassifier()],
       ['Gradient Boosting Classification :', GradientBoostingClassifier()],
       ['Ada Boosting Classification :', AdaBoostClassifier()],
       ['RandomForest Classification :', AdaBoostClassifier()],
       ['Extra Tree Classification :', ExtraTreesClassifier()],
       ['K-Neighbors Classification :', KNeighborsClassifier()],
       ['Support Vector Classification :',SVC()],
       ['Gaussian Naive Bayes :',GaussianNB()]]

cla_pred = []

for name,model in classifiers:
    model = model
    model.fit(x_train,y_train)
    predictions = model.predict(x_test)
    cla_pred.append(accuracy_score(y_test,predictions))
    print(name,accuracy_score(y_test,predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression : 0.7571409679978841
Decision Tree Classification : 0.6012298333774134
Gradient Boosting Classification : 0.7562814070351759
Ada Boosting Classification : 0.7571409679978841
RandomForest Classification : 0.7571409679978841
Extra Tree Classification : 0.7542978048135414
K-Neighbors Classification : 0.727122454377149


KeyboardInterrupt: ignored

In [89]:
model = DecisionTreeClassifier()
model.fit(x_train,y_train)
predictions = model.predict(x_test)
print(accuracy_score(y_test,predictions))

0.5925619834710744


In [54]:
x_test.shape , predictions.tolist().count(1), predictions.tolist().count(2), predictions.tolist().count(3)

((6050, 43), 6010, 36, 4)

In [38]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11687 sha256=1992e35fbb6e1707cc49a0b1947b45cff4878c002dd4885fb290f3fd86806530
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [41]:
#LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def lgbm_evaluate(**params):
#     warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
        
    clf = LGBMClassifier(**params, n_estimators=20000, nthread=-1)

    test_pred_proba = np.zeros((x_train.shape[0], 3))
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train, y_train)):
        X_train_bo, X_valid = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_train_bo, y_valid = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        
        model = LGBMClassifier(**params, n_estimators = 10000, n_jobs = -1)
        model.fit(X_train_bo, y_train_bo, 
                eval_set=[(X_train_bo, y_train_bo), (X_valid, y_valid)], eval_metric='binary_logloss',
                verbose=False, early_stopping_rounds=200)

        y_pred_valid = model.predict_proba(X_valid)

        test_pred_proba[valid_idx] = y_pred_valid

    return accuracy_score(y_valid, y_pred_valid.argmax(1))

In [None]:
#hyper parameter tuning
params = {'colsample_bytree': (0.6, 1),
     'learning_rate': (.001, .08), 
      'num_leaves': (8, 124), 
      'subsample': (0.6, 1), 
      'max_depth': (3, 25), 
      'reg_alpha': (.05, 15.0), 
      'reg_lambda': (.05, 15.0), 
      'min_split_gain': (.001, .03),
      'min_child_weight': (12, 80)}

from sklearn.model_selection import StratifiedKFold
n_fold = 20
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=11)

from bayes_opt import BayesianOptimization
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points=5, n_iter=5)

In [55]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

In [56]:
model = Sequential()

model.add(Flatten())

model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dense(250))
model.add(Dropout(0.1))
model.add(Activation('relu'))
model.add(Dense(50))
model.add(Activation('relu'))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [57]:
model.fit( x_train, y_train,
                    epochs = 10,
                    batch_size = 20,
                    validation_data = (x_test, y_test))
model.summary()

Epoch 1/10


InvalidArgumentError: ignored

In [62]:
predictions = model.predict([x_test])

In [63]:
predictions = list(np.argmax(predictions[x]) for x in range(len(predictions)))

In [64]:
predictions.count(1), predictions.count(2), predictions.count(3)

(2750, 0, 0)

**Submission**

In [43]:
def prepare_dataset(df, one_hot_encoded_lst, dates):
  df = pd.get_dummies(df, columns=one_hot_encoded_lst)

  df['Gender'] = df['Gender'].map({'F':0, 'M':1})
  df['Visted_Previously'] = df['Visted_Previously'].map({'No':0, 'Yes':1})
  df['Previous_Cancellations'] = df['Previous_Cancellations'].map({'No':0, 'Yes':1})
  df['Required_Car_Parking'] = df['Required_Car_Parking'].map({'Yes':1, 'No':0})
  df['Use_Promotion'] = df['Use_Promotion'].map({'Yes':1, 'No':0})

  df[dates[0]] = pd.to_datetime(df[dates[0]])
  df[dates[1]] = pd.to_datetime(df[dates[1]])
  df[dates[2]] = pd.to_datetime(df[dates[2]])

  df['Expected_stay'] = (df[dates[1]] - df[dates[0]]).dt.days
  df['Booking_to_checkingin'] = (df[dates[0]] - df[dates[2]]).dt.days
  df['Month_of_stay'] = df[dates[0]].dt.month

  weekdayin = df[dates[0]].dt.dayofweek
  weekdayout = df[dates[1]].dt.dayofweek
  from pandas import DataFrame

  fina = []
  for x,y in zip(weekdayin, weekdayout):
    t = []
    if y >= x:
      for i in range(x, y + 1):
        t.append(i)
      if 5 in t or 6 in t:
        fina.append(1)
      else:
        fina.append(0)
    else:
      for i in range(x, 7):
        t.append(i)
      for j in range(0, y + 1):
        t.append(i)
      if 5 in t or 6 in t:
        fina.append(1)
      else:
        fina.append(0)
  df['weekend_stay'] = DataFrame(fina,columns=['weekend_stay'])['weekend_stay'].values
  df = df.drop(dates, 1)

  return df

In [90]:
data_test = pd.read_csv('Hotel-A-test.csv', index_col=0)

In [91]:
data_test.head()

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,Booking_date,Adults,Children,Babies,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Use_Promotion,Discount_Rate,Room_Rate
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
62931593,F,52,Latino,Grad,25K --50K,South,City Hotel,11/18/2016,11/19/2016,10/28/2016,3,3,0,HB,No,No,No Deposit,Direct,Yes,Yes,10,153
70586099,F,47,Latino,Grad,25K --50K,East,Airport Hotels,11/18/2016,11/19/2016,8/6/2016,2,1,0,FB,No,No,No Deposit,Online,No,No,0,210
4230648,F,28,Asian American,Grad,<25K,East,City Hotel,4/28/2017,5/1/2017,4/8/2017,2,2,0,BB,No,No,No Deposit,Agent,No,Yes,5,117
25192322,F,65,caucasian,High-School,25K --50K,South,Airport Hotels,11/18/2016,11/20/2016,5/20/2016,1,3,2,FB,No,No,No Deposit,Online,Yes,Yes,10,107
80931528,M,45,African American,College,25K --50K,South,City Hotel,11/18/2016,11/20/2016,10/31/2016,3,1,0,BB,No,No,Refundable,Agent,No,No,0,119


In [92]:
data_test = prepare_dataset(data_test, one_hot_encoded_lst, dates)

In [93]:
data_test.head()

Unnamed: 0_level_0,Gender,Age,Adults,Children,Babies,Visted_Previously,Previous_Cancellations,Required_Car_Parking,Use_Promotion,Discount_Rate,Room_Rate,Ethnicity_African American,Ethnicity_Asian American,Ethnicity_Latino,Ethnicity_caucasian,Educational_Level_College,Educational_Level_Grad,Educational_Level_High-School,Educational_Level_Mid-School,Income_25K --50K,Income_50K -- 100K,Income_<25K,Income_>100K,Country_region_East,Country_region_North,Country_region_South,Country_region_West,Hotel_Type_Airport Hotels,Hotel_Type_City Hotel,Hotel_Type_Resort,Meal_Type_BB,Meal_Type_FB,Meal_Type_HB,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Expected_stay,Booking_to_checkingin,Month_of_stay,weekend_stay
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
62931593,0,52,3,3,0,0,0,1,1,10,153,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1,21,11,1
70586099,0,47,2,1,0,0,0,0,0,0,210,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,104,11,1
4230648,0,28,2,2,0,0,0,0,1,5,117,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,3,20,4,1
25192322,0,65,1,3,2,0,0,1,1,10,107,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,2,182,11,1
80931528,1,45,3,1,0,0,0,0,0,0,119,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,2,18,11,1


In [94]:
test_predictions = model.predict(data_test)

In [95]:
test_predictions = test_predictions.tolist()
test_predictions.count(1), test_predictions.count(2), test_predictions.count(3)

(2831, 969, 518)

In [96]:
print(test_predictions)

[2, 1, 1, 2, 1, 3, 1, 3, 1, 2, 2, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 2, 3, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 2, 1, 3, 2, 3, 1, 3, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 3, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, 1, 1, 3, 1, 1, 1, 2, 1, 2, 2, 1, 1, 3, 1, 1, 1, 2, 1, 2, 3, 1, 2, 1, 1, 3, 3, 1, 2, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 2, 3, 2, 1, 2, 1, 2, 1, 3, 1, 2, 1, 2, 1, 1, 3, 1, 1, 2, 2, 3, 1, 1, 1, 3, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 3, 1, 3, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 1, 1, 3, 3, 2, 2, 

In [97]:
col_drop = data_test.columns.tolist()

In [98]:
submission = data_test.drop(col_drop, 1)

In [99]:
submission['Reservation_status'] = DataFrame(test_predictions,columns=['Reservation_status'])['Reservation_status'].values

In [100]:
submission.head()

Unnamed: 0_level_0,Reservation_status
Reservation-id,Unnamed: 1_level_1
62931593,2
70586099,1
4230648,1
25192322,2
80931528,1


In [101]:
submission.to_csv('submission2-day1.csv')