In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Hotel Reservations.csv')
df

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


# Exploratory Data Analysis

In [3]:
df.groupby('no_of_adults')['Booking_ID'].agg(['count'])

Unnamed: 0_level_0,count
no_of_adults,Unnamed: 1_level_1
0,139
1,7695
2,26108
3,2317
4,16


In [4]:
df.groupby('no_of_children')['Booking_ID'].agg(['count'])

Unnamed: 0_level_0,count
no_of_children,Unnamed: 1_level_1
0,33577
1,1618
2,1058
3,19
9,2
10,1


In [5]:
def children(x):
    if x==0:
        return 'None'
    elif x==1:
        return 'One'
    elif x==2:
        return 'Two'
    else:
        return 'More than two'

In [6]:
df['children'] = df['no_of_children'].apply(children)

In [7]:
df.groupby('no_of_weekend_nights')['Booking_ID'].agg(['count'])

Unnamed: 0_level_0,count
no_of_weekend_nights,Unnamed: 1_level_1
0,16872
1,9995
2,9071
3,153
4,129
5,34
6,20
7,1


In [8]:
def weekend_nights(x):
    if x==0:
        return 'None'
    elif x==1:
        return 'One'
    elif x==2:
        return 'Two'
    elif x==3 or x==4:
        return 'Three/Four'
    else:
        return 'More than Four'

In [9]:
df['weekend_nights'] = df['no_of_weekend_nights'].apply(weekend_nights)

In [10]:
df.groupby('no_of_week_nights')['Booking_ID'].agg(['count'])

Unnamed: 0_level_0,count
no_of_week_nights,Unnamed: 1_level_1
0,2387
1,9488
2,11444
3,7839
4,2990
5,1614
6,189
7,113
8,62
9,34


In [11]:
def week_nights(x):
    if x==0:
        return 'None'
    elif x==1:
        return 'One'
    elif x==2:
        return 'Two'
    elif x==3:
        return 'Three'
    elif x==4:
        return 'Four'
    elif x==5:
        return 'Five'
    elif x==6:
        return 'Six'
    elif x>6 and x<11:
        return 'Seven to Ten'
    elif x>10 and x<15:
        return 'Eleven to Fourteen'
    else:
        return 'Fifteen to Seventeen'

In [12]:
df['week_nights'] = df['no_of_week_nights'].apply(week_nights)

In [13]:
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,...,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,children,weekend_nights,week_nights
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,...,Offline,0,0,0,65.0,0,Not_Canceled,,One,Two
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,...,Online,0,0,0,106.68,1,Not_Canceled,,Two,Three
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,...,Online,0,0,0,60.0,0,Canceled,,Two,One
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,...,Online,0,0,0,100.0,0,Canceled,,,Two
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,...,Online,0,0,0,94.5,0,Canceled,,One,One


In [14]:
df.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'avg_price_per_room', 'no_of_special_requests', 'booking_status',
       'children', 'weekend_nights', 'week_nights'],
      dtype='object')

In [15]:
cat_cols = ['no_of_adults','children','weekend_nights','week_nights','type_of_meal_plan','required_car_parking_space','room_type_reserved', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',]

In [16]:
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

In [17]:
df.dtypes

Booking_ID                                object
no_of_adults                            category
no_of_children                             int64
no_of_weekend_nights                       int64
no_of_week_nights                          int64
type_of_meal_plan                       category
required_car_parking_space              category
room_type_reserved                      category
lead_time                                  int64
arrival_year                            category
arrival_month                           category
arrival_date                            category
market_segment_type                     category
repeated_guest                          category
no_of_previous_cancellations               int64
no_of_previous_bookings_not_canceled       int64
avg_price_per_room                       float64
no_of_special_requests                     int64
booking_status                            object
children                                category
weekend_nights      

In [18]:
df['booking_status'] = df['booking_status'].astype('category')

In [19]:
y = df['booking_status'].cat.codes

In [37]:
y = torch.Tensor(y)

In [38]:
y[:5]

tensor([1, 1, 0, 0, 0], dtype=torch.int8)

In [22]:
df = df.drop(['Booking_ID','no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights','booking_status'],axis=1)

In [23]:
df.head()

Unnamed: 0,no_of_adults,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,children,weekend_nights,week_nights
0,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,,One,Two
1,2,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,,Two,Three
2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,,Two,One
3,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,,,Two
4,2,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,,One,One


In [24]:
df['type_of_meal_plan'] = df['type_of_meal_plan'].cat.codes
df['room_type_reserved'] = df['room_type_reserved'].cat.codes
df['market_segment_type'] = df['market_segment_type'].cat.codes
df['children'] = df['children'].cat.codes
df['weekend_nights'] = df['weekend_nights'].cat.codes
df['week_nights'] = df['week_nights'].cat.codes

In [25]:
df

Unnamed: 0,no_of_adults,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,children,weekend_nights,week_nights
0,2,0,0,0,224,2017,10,2,3,0,0,0,65.00,0,1,2,9
1,2,3,0,0,5,2018,11,6,4,0,0,0,106.68,1,1,4,8
2,1,0,0,0,1,2018,2,28,4,0,0,0,60.00,0,1,4,5
3,2,0,0,0,211,2018,5,20,4,0,0,0,100.00,0,1,1,9
4,2,3,0,0,48,2018,4,11,4,0,0,0,94.50,0,1,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,0,0,3,85,2018,8,3,4,0,0,0,167.80,1,1,4,7
36271,2,0,0,0,228,2018,10,17,4,0,0,0,90.95,2,1,2,8
36272,2,0,0,0,148,2018,7,1,4,0,0,0,98.39,2,1,4,7
36273,2,3,0,0,63,2018,4,21,4,0,0,0,94.50,0,1,1,8


In [26]:
df['no_of_adults'] = df['no_of_adults'].cat.codes
df['required_car_parking_space'] = df['required_car_parking_space'].cat.codes
df['arrival_year'] = df['arrival_year'].cat.codes
df['arrival_date'] = df['arrival_date'].cat.codes
df['arrival_month'] = df['arrival_month'].cat.codes
df['repeated_guest'] = df['repeated_guest'].cat.codes

In [41]:
x = df.values
y = y.numpy()

In [42]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)

In [43]:
class Model(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(17,350)
        self.fc2 = nn.Linear(350, 100)
        self.fc3 = nn.Linear(100,2)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.dropout(x, p=0.1)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.dropout(x, p=0.1)
        x = F.relu(x)
        x = self.fc3(x)
        x = torch.sigmoid(x)
        
        return x

In [45]:
model = Model()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [47]:
epochs=300
batches=5
losses=[]
for b in range(batches):
    for i in range(epochs):
        y_pred = model.forward(X_train)
        loss = criterion(y_pred, y_train)
        losses.append(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'batch: {b:2}  loss: {losses[-1].item():10.8f}')

batch:  0  loss: 0.64097941
batch:  1  loss: 0.64097941
batch:  2  loss: 0.64097941
batch:  3  loss: 0.64097941
batch:  4  loss: 0.64097941


In [51]:
test_data = torch.FloatTensor(x)

In [55]:
correct = 0
predictions = []
with torch.no_grad():
    for i,data in enumerate(test_data):
        y_val = model.forward(data)
#         print(f'{i+1:2}. {str(y_val.argmax().item()):38}  {y[i]}')
        predictions.append(y_val.argmax().item())
        if y_val.argmax().item() == y[i]:
            correct += 1
print(f'\n{correct} out of {len(y)} = {100*correct/len(y):.2f}% correct')


24390 out of 36275 = 67.24% correct
