In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier

In [3]:
train = pd.read_csv('F:\Data Storm 2.0\Dataset\Hotel-A-train.csv')
train.head()

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,...,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
0,39428300,F,40,Latino,Grad,<25K,North,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,Yes,10,218
1,77491756,F,49,Latino,Mid-School,50K -- 100K,East,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,Refundable,Online,Yes,Check-In,No,0,185
2,73747291,F,42,caucasian,Grad,<25K,East,City Hotel,7/2/2015,7/6/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,No,0,119
3,67301739,M,25,African American,College,>100K,South,Airport Hotels,7/2/2015,7/3/2015,...,BB,No,No,Refundable,Agent,Yes,Check-In,Yes,5,144
4,77222321,F,62,Latino,High-School,25K --50K,East,Resort,7/3/2015,7/4/2015,...,BB,No,No,No Deposit,Direct,No,Check-In,Yes,10,242


In [4]:
validate = pd.read_csv('F:\Data Storm 2.0\Dataset\Hotel-A-validation.csv')
validate.head()

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,...,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
0,45716350,M,56,caucasian,Grad,<25K,West,Resort,8/31/2016,9/2/2016,...,HB,No,No,No Deposit,Agent,No,No-Show,Yes,15,192
1,88857401,M,60,Latino,College,25K --50K,West,Resort,8/31/2016,9/4/2016,...,FB,Yes,No,No Deposit,Online,Yes,Canceled,No,0,187
2,16074440,F,58,Asian American,College,<25K,North,Airport Hotels,9/1/2016,9/2/2016,...,FB,No,No,No Deposit,Direct,No,Canceled,Yes,10,227
3,10992124,F,23,Latino,College,25K --50K,East,Airport Hotels,8/31/2016,9/2/2016,...,FB,Yes,No,Refundable,Direct,No,Check-In,Yes,25,189
4,15934351,F,47,Asian American,College,25K --50K,South,City Hotel,8/31/2016,9/1/2016,...,HB,Yes,No,No Deposit,Online,Yes,Check-In,Yes,10,218


In [5]:
train['Reservation_Status'].replace('Check-In', 1,inplace=True)
train['Reservation_Status'].replace('Canceled', 2,inplace=True)
train['Reservation_Status'].replace('No-Show', 3,inplace=True)

validate['Reservation_Status'].replace('Check-In', 1,inplace=True)
validate['Reservation_Status'].replace('Canceled', 2,inplace=True)
validate['Reservation_Status'].replace('No-Show', 3,inplace=True)

In [6]:
train['Income'].replace('<25K', 1,inplace=True)
train['Income'].replace('25K --50K', 2,inplace=True)
train['Income'].replace('50K -- 100K', 3,inplace=True)
train['Income'].replace('>100K', 4,inplace=True)

validate['Income'].replace('<25K', 1,inplace=True)
validate['Income'].replace('25K --50K', 2,inplace=True)
validate['Income'].replace('50K -- 100K', 3,inplace=True)
validate['Income'].replace('>100K', 4,inplace=True)

In [7]:
train['Meal_Type'].replace('BB', 1,inplace=True)
train['Meal_Type'].replace('HB', 2,inplace=True)
train['Meal_Type'].replace('FB', 3,inplace=True)

validate['Meal_Type'].replace('BB', 1,inplace=True)
validate['Meal_Type'].replace('HB', 2,inplace=True)
validate['Meal_Type'].replace('FB', 3,inplace=True)

### Feature Engineering

In [8]:
train['checkin_month'] = pd.DatetimeIndex(train['Expected_checkin']).month
train['checkout_month'] = pd.DatetimeIndex(train['Expected_checkout']).month
train['booking_month'] = pd.DatetimeIndex(train['Booking_date']).month

In [9]:
validate['checkin_month'] = pd.DatetimeIndex(validate['Expected_checkin']).month
validate['checkout_month'] = pd.DatetimeIndex(validate['Expected_checkout']).month
validate['booking_month'] = pd.DatetimeIndex(validate['Booking_date']).month

In [10]:
train['Expected_checkin'] = pd.to_datetime(train['Expected_checkin'])
validate['Expected_checkin'] = pd.to_datetime(validate['Expected_checkin'])

train['Expected_checkout'] = pd.to_datetime(train['Expected_checkout'])
validate['Expected_checkout'] = pd.to_datetime(validate['Expected_checkout'])

train['Booking_date'] = pd.to_datetime(train['Booking_date'])
validate['Booking_date'] = pd.to_datetime(validate['Booking_date'])

In [11]:
train['staying_days'] = (train['Expected_checkout']-train['Expected_checkin']).dt.days
validate['staying_days'] = (validate['Expected_checkout']-validate['Expected_checkin']).dt.days

In [12]:
train['Dependants']=train['Adults']+train['Children']+train['Babies']
validate['Dependants'] = validate['Adults']+validate['Children']+validate['Babies']

In [13]:

train['Dependants_without_babies']=train['Adults']+train['Children']
validate['Dependants_without_babies'] = validate['Adults']+validate['Children']

In [14]:
train['days_until_check_in']=(train['Expected_checkin']-train['Booking_date']).dt.days
validate['days_until_check_in']=(validate['Expected_checkin']-validate['Booking_date']).dt.days

## Scaling

In [15]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
train[["days_until_check_in"]] = min_max_scaler.fit_transform(train[["days_until_check_in"]])
train[["staying_days"]] = min_max_scaler.fit_transform(train[["staying_days"]])

validate[["days_until_check_in"]] = min_max_scaler.fit_transform(validate[["days_until_check_in"]])
validate[["staying_days"]] = min_max_scaler.fit_transform(validate[["staying_days"]])
#print(train["days_until_check_in"])

In [16]:
#Copies of originals
train_original=train.copy()
validate_original=validate.copy()
train.head()

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,...,Use_Promotion,Discount_Rate,Room_Rate,checkin_month,checkout_month,booking_month,staying_days,Dependants,Dependants_without_babies,days_until_check_in
0,39428300,F,40,Latino,Grad,1,North,City Hotel,2015-07-01,2015-07-02,...,Yes,10,218,7,7,5,0.0,4,4,0.063202
1,77491756,F,49,Latino,Mid-School,3,East,City Hotel,2015-07-01,2015-07-02,...,No,0,185,7,7,5,0.0,6,6,0.05618
2,73747291,F,42,caucasian,Grad,1,East,City Hotel,2015-07-02,2015-07-06,...,No,0,119,7,7,6,1.0,6,6,0.009831
3,67301739,M,25,African American,College,4,South,Airport Hotels,2015-07-02,2015-07-03,...,Yes,5,144,7,7,6,0.0,7,7,0.022472
4,77222321,F,62,Latino,High-School,2,East,Resort,2015-07-03,2015-07-04,...,Yes,10,242,7,7,6,0.0,2,2,0.023876


In [17]:
train.drop(['Expected_checkin', 'Expected_checkout','Booking_date', 'Adults', 'Children', 'Babies'], axis='columns', inplace=True)
validate.drop(['Expected_checkin', 'Expected_checkout','Booking_date', 'Adults', 'Children', 'Babies'], axis='columns', inplace=True)

In [40]:
train.head()

Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Meal_Type,Visted_Previously,...,Use_Promotion,Discount_Rate,Room_Rate,checkin_month,checkout_month,booking_month,staying_days,Dependants,Dependants_without_babies,days_until_check_in
0,39428300,F,40,Latino,Grad,1,North,City Hotel,1,No,...,Yes,10,218,7,7,5,0.0,4,4,0.063202
1,77491756,F,49,Latino,Mid-School,3,East,City Hotel,1,No,...,No,0,185,7,7,5,0.0,6,6,0.05618
2,73747291,F,42,caucasian,Grad,1,East,City Hotel,1,No,...,No,0,119,7,7,6,1.0,6,6,0.009831
3,67301739,M,25,African American,College,4,South,Airport Hotels,1,No,...,Yes,5,144,7,7,6,0.0,7,7,0.022472
4,77222321,F,62,Latino,High-School,2,East,Resort,1,No,...,Yes,10,242,7,7,6,0.0,2,2,0.023876


## Modeling

In [18]:
x_t= train.drop('Reservation_Status',1)
y_t= train.Reservation_Status

In [19]:
x_v = validate.drop('Reservation_Status',1)
y_v= validate.Reservation_Status

In [20]:
x_t = pd.get_dummies(x_t)
x_v = pd.get_dummies(x_v)
x_t.head()

Unnamed: 0,Reservation-id,Age,Income,Meal_Type,Discount_Rate,Room_Rate,checkin_month,checkout_month,booking_month,staying_days,...,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,39428300,40,1,1,10,218,7,7,5,0.0,...,1,0,0,0,0,1,0,1,0,1
1,77491756,49,3,1,0,185,7,7,5,0.0,...,0,0,1,0,0,1,0,1,1,0
2,73747291,42,1,1,0,119,7,7,6,1.0,...,1,0,0,0,0,1,0,1,1,0
3,67301739,25,4,1,5,144,7,7,6,0.0,...,0,0,1,1,0,0,0,1,0,1
4,77222321,62,2,1,10,242,7,7,6,0.0,...,1,0,0,0,1,0,1,0,0,1


In [21]:
model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)

In [22]:
model.fit(x_t, y_t)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [23]:
pred_v = model.predict(x_v)
validate_original['Predicted']=pred_v

In [24]:
df = pd.DataFrame() 

# append columns to an empty DataFrame 
df['Reservation-id'] = validate['Reservation-id']
df['actual'] = validate['Reservation_Status']
df['Reservation_Status_Pred'] =pred_v

In [25]:
df.head()

Unnamed: 0,Reservation-id,actual,Reservation_Status_Pred
0,45716350,3,1
1,88857401,2,1
2,16074440,2,1
3,10992124,1,1
4,15934351,1,1


In [26]:
df['Reservation_Status_Pred'].unique()

array([1, 3, 2], dtype=int64)

In [27]:
accuracy_score(y_v,pred_v)

0.5853037468170244

## Prediction

In [28]:
test=pd.read_csv('F:\Data Storm 2.0\Dataset\Hotel-A-test.csv')

In [29]:
test['Meal_Type'].replace('BB', 1,inplace=True)
test['Meal_Type'].replace('HB', 2,inplace=True)
test['Meal_Type'].replace('FB', 3,inplace=True)

In [30]:
test['checkin_month'] = pd.DatetimeIndex(test['Expected_checkin']).month
test['checkout_month']=pd.DatetimeIndex(test['Expected_checkout']).month
test['booking_month']=pd.DatetimeIndex(test['Booking_date']).month

In [31]:
test['Expected_checkin'] = pd.to_datetime(test['Expected_checkin'])
test['Expected_checkout'] = pd.to_datetime(test['Expected_checkout'])
test['Booking_date'] = pd.to_datetime(test['Booking_date'])

In [32]:
test['days_until_check_in']=(test['Expected_checkin']-test['Booking_date']).dt.days

In [33]:
test['staying_days'] = (test['Expected_checkout']-test['Expected_checkin']).dt.days

In [34]:
test['Dependants']=test['Adults']+test['Children']+test['Babies']

In [35]:
test['Dependants_without_babies']=test['Adults']+test['Children']

In [36]:
min_max_scaler = MinMaxScaler()
test[["days_until_check_in"]] = min_max_scaler.fit_transform(test[["days_until_check_in"]])
test[["staying_days"]] = min_max_scaler.fit_transform(test[["staying_days"]])

In [37]:
test.drop(['Expected_checkin', 'Expected_checkout','Booking_date', 'Adults', 'Children', 'Babies'], axis='columns', inplace=True)

In [41]:
test.head()

Unnamed: 0,Reservation-id,Age,Meal_Type,Discount_Rate,Room_Rate,checkin_month,checkout_month,booking_month,days_until_check_in,staying_days,...,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,62931593,52,2,10,153,11,11,10,0.110619,0.0,...,1,0,0,0,1,0,0,1,0,1
1,70586099,47,3,0,210,11,11,8,0.477876,0.0,...,1,0,0,0,0,1,1,0,1,0
2,4230648,28,1,5,117,4,5,4,0.106195,0.666667,...,1,0,0,1,0,0,1,0,0,1
3,25192322,65,3,10,107,11,11,5,0.823009,0.333333,...,1,0,0,0,0,1,0,1,0,1
4,80931528,45,1,0,119,11,11,10,0.097345,0.333333,...,0,0,1,1,0,0,1,0,1,0


In [42]:
test = pd.get_dummies(test)

In [43]:
pred_test = model.predict(test)

ValueError: Number of features of the model must match the input. Model n_features is 44 and input n_features is 47 

In [None]:
submission=pd.DataFrame()
submission['Reservation-id']=test_original['Reservation-id']
submission['Reservation_Status']=pred_test

In [None]:
print(set(list(pred_test)))

In [None]:
#submission.to_csv('Submission_1.csv',index=False)