# Data Preparation - for features 17 - 24

In [None]:
# import library used for data management
import numpy as np 
import pandas as pd 

In [None]:
# Read the two input CSV files from the working directory:
#   original - the table the features are prepared from
#   label    - the table that is joined column-wise onto the features later
original, label = (
    pd.read_csv(csv_name)
    for csv_name in ('Sharon_hotel_bookings.csv', 'label_hotel_bookings.csv')
)

In [None]:
# Display the DataFrame as the cell output for a first visual check.
# NOTE(review): pandas normally truncates long frames when rendering, so this
# may show only the first and last rows rather than the whole data set -
# confirm the display options if the full table is needed.
original

In [None]:
# Print the column names, non-null counts and dtypes of the loaded data.
original.info()
# Observations from the summary above:
# 'is_repeated_guest' should be categorical instead of numeric
# 16,340 missing values found in 'agent'
# 'agent' should be categorical instead of numeric

In [None]:
# Recast the 'is_repeated_guest' flag as text so it is handled as a
# categorical label rather than a number, then re-check the dtypes.
original = original.astype({'is_repeated_guest': str})
original.info()

In [None]:
# handle missing values in 'agent'
# NaN in 'agent' means the booking is not made by a travel agency,
# so replace NaN with 0 for the ease of data handling.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the selection original['agent']: the in-place
# form acts on an intermediate object, triggers a chained-assignment warning
# in recent pandas, and leaves `original` unchanged once Copy-on-Write is on.
original['agent'] = original['agent'].fillna(0)
# show the filled column as the cell output to confirm the replacement
original['agent']

In [None]:
# Treat 'agent' as a categorical identifier: cast its values to strings,
# store them back in the frame and re-inspect the column dtypes.
# NOTE(review): if the column was read as float (it contained NaN), the
# resulting labels will presumably look like '9.0' rather than '9' - confirm
# that this is acceptable downstream.
agent_labels = original['agent'].astype(str)
original['agent'] = agent_labels
original.info()

In [None]:
original

In [None]:
# One-hot encode the categorical 'deposit_type' column; each generated
# indicator column is named 'deposit_type_<category>'.
deposit_type_values = original['deposit_type']
DepositTypeDummy = pd.get_dummies(deposit_type_values, prefix='deposit_type')

In [None]:
# Append the indicator columns to the right of the existing columns
# (axis=1 concatenates column-wise, aligning rows on the index),
# then display the widened frame.
frames_to_join = [original, DepositTypeDummy]
original = pd.concat(frames_to_join, axis=1, sort=True)
original

In [None]:
# Remove the source column now that it is encoded, together with the
# 'deposit_type_Refundable' indicator, which is redundant: it can be expressed
# as not 'deposit_type_No Deposit' and not 'deposit_type_Non Refund'.
redundant_columns = ['deposit_type', 'deposit_type_Refundable']
original = original.drop(redundant_columns, axis=1)
original

In [None]:
# Hypothesis: a customer whose assigned room type equals the reserved room
# type (reserved_assigned_room_type_match = 1) is more likely to NOT cancel
# the booking.
# Row-wise comparison of the reserved and the assigned room type.
room_type_matches = original['reserved_room_type'].eq(original['assigned_room_type'])
# Encode the boolean outcome as the categorical strings '1' (match) and
# '0' (no match).
original['reserved_assigned_room_type_match'] = room_type_matches.astype(int).astype(str)
# The two source columns are now summarised by the match flag, so drop them
# and display the result.
original = original.drop(['reserved_room_type', 'assigned_room_type'], axis=1)
original

In [None]:
original.info()

In [None]:
# Place the prepared features and the label table side by side in one frame
# and display it.
# NOTE(review): rows are paired by index, so this presumably relies on both
# CSV files listing the bookings in the same order - confirm.
PrepData = pd.concat(objs=(original, label), axis=1, sort=True)
PrepData

In [None]:
# Persist the combined features-plus-label table to disk. No `index`
# argument is passed, so pandas' default applies and the row index is written
# as the first column of the file.
PrepData.to_csv(path_or_buf="April9_processeddata.csv")

# Decision Tree & Cross Validation by GridSearchCV

In [None]:
# Columns used as model inputs.
features = [
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'agent',
    'deposit_type_No Deposit',
    'deposit_type_Non Refund',
    'reserved_assigned_room_type_match',
]
# Column to predict. It is kept as a one-element list, so `y` below is a
# single-column DataFrame rather than a Series.
target = ['is_canceled']
# Split the prepared table into the input matrix and the target.
X, y = PrepData[features], PrepData[target]

In [None]:
# import function for data split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing and fit on the remaining 80%.
# The previous test_size=0.8 inverted this ratio: the model was trained on
# only 20% of the data while 80% was reserved for testing.
# random_state fixes the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# import DecisionTreeClassifier used for tree building
from sklearn.tree import DecisionTreeClassifier
# import GridSearchCV for cross validation
from sklearn.model_selection import GridSearchCV