In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os

# The CSV location was pinned to one user's Windows desktop, which breaks
# "Restart & Run All" on any other machine. Point HOTEL_BOOKINGS_CSV at your
# copy of the file; when the variable is unset the original path is used, so
# existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'HOTEL_BOOKINGS_CSV',
    'C:/Users/vedan/OneDrive/Desktop/hotel_bookings.csv',
)
full_data = pd.read_csv(DATA_PATH)

In [3]:
# Correlation of every numeric column with the target `is_canceled`.
# The frame also holds string columns (hotel, meal, ...). Calling corr() on
# the whole frame raises on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently, so restrict to numeric/bool columns explicitly.
cancel_corr = full_data.select_dtypes(include=['number', 'bool']).corr()['is_canceled']

# Rank by absolute strength. The target's self-correlation (always 1.0) is
# removed by label instead of relying on it sorting into position 0.
cancel_corr.drop('is_canceled').abs().sort_values(ascending=False)

lead_time                         0.293123
total_of_special_requests         0.234658
required_car_parking_spaces       0.195498
booking_changes                   0.144381
previous_cancellations            0.110133
is_repeated_guest                 0.084793
agent                             0.083114
adults                            0.060017
previous_bookings_not_canceled    0.057358
days_in_waiting_list              0.054186
adr                               0.047557
babies                            0.032491
stays_in_week_nights              0.024765
company                           0.020642
arrival_date_year                 0.016660
arrival_date_week_number          0.008148
arrival_date_day_of_month         0.006130
children                          0.005048
stays_in_weekend_nights           0.001791
Name: is_canceled, dtype: float64

In [4]:
# Count final reservation statuses within each value of the target.
# NOTE(review): the status looks fully determined by `is_canceled`, which is
# presumably why it is kept out of the feature lists - confirm.
full_data['reservation_status'].groupby(full_data['is_canceled']).value_counts()

is_canceled  reservation_status
0            Check-Out             75166
1            Canceled              43017
             No-Show                1207
Name: reservation_status, dtype: int64

In [5]:
# Columns fed to the model as numeric inputs (imputed, not encoded).
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "company",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

# Columns fed to the model as categorical inputs (imputed, then one-hot encoded).
cat_features = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

In [6]:
# Eyeball the numeric inputs; `agent` and `company` visibly contain missing values.
full_data.loc[:, num_features]

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,agent,company,required_car_parking_spaces,total_of_special_requests,adr
0,342,27,1,0,0,2,0.0,0,0,0,0,,,0,0,0.00
1,737,27,1,0,0,2,0.0,0,0,0,0,,,0,0,0.00
2,7,27,1,0,1,1,0.0,0,0,0,0,,,0,0,75.00
3,13,27,1,0,1,1,0.0,0,0,0,0,304.0,,0,0,75.00
4,14,27,1,0,2,2,0.0,0,0,0,0,240.0,,0,1,98.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,23,35,30,2,5,2,0.0,0,0,0,0,394.0,,0,0,96.14
119386,102,35,31,2,5,3,0.0,0,0,0,0,9.0,,0,2,225.43
119387,34,35,31,2,5,2,0.0,0,0,0,0,9.0,,0,4,157.71
119388,109,35,31,2,5,2,0.0,0,0,0,0,89.0,,0,0,104.40


In [7]:
# Full model input column list: numeric columns first, then categorical ones.
features = [*num_features, *cat_features]

In [8]:
# Guard against target leakage explicitly. The earlier
# drop(columns='is_canceled', axis=1) gave both `columns=` and `axis=` and
# copied the whole frame only to have it subset on the same line.
assert 'is_canceled' not in features, "target column must not be used as a feature"

# Feature matrix: exactly the columns listed in `features`.
X = full_data[features]
# Target vector: the `is_canceled` column.
y = full_data['is_canceled']

In [9]:
from sklearn.impute import SimpleImputer
# Numeric columns: replace missing values with a constant. 0 is what the
# 'constant' strategy already falls back to for numeric data when no
# fill_value is given; it is spelled out here so the choice is visible.
num_transformer = SimpleImputer(
    strategy='constant',
    fill_value=0,
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Categorical columns: fill missing entries with the literal 'Unknown', then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import KFold,cross_val_score

In [12]:
# Candidate classifiers as (label, estimator) pairs, all seeded identically.
base_models = [
    ('DT_model', DecisionTreeClassifier(random_state=42)),
    ('RF_model', RandomForestClassifier(n_jobs=-1, random_state=42)),
    ('LR_model', LogisticRegression(n_jobs=-1, random_state=42)),
    ('XGB_model', XGBClassifier(n_jobs=-1, random_state=42)),
]

# Shuffled k-fold splitter with a fixed seed; the same splitter object is
# passed to every model's cross-validation run.
kfolds = 10
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)


In [13]:
# Cross-validate each candidate behind the shared preprocessor and report
# accuracy summary statistics, rounded to 4 decimals.
for name, model in base_models:
    model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    cv_results = cross_val_score(
        model_steps, X, y,
        scoring='accuracy',
        cv=split,
        n_jobs=-1,
    )
    # One pass over the four reducers instead of four separate assignments.
    min_score, max_score, mean_score, std_dev = (
        round(reduce_fn(cv_results), 4)
        for reduce_fn in (np.min, np.max, np.mean, np.std)
    )
    print(f"{name} cross validation accuarcy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

DT_model cross validation accuarcy score: 0.8289 +/- 0.002 (std) min: 0.8244, max: 0.8311
RF_model cross validation accuarcy score: 0.8695 +/- 0.0019 (std) min: 0.8657, max: 0.8724
LR_model cross validation accuarcy score: 0.795 +/- 0.0027 (std) min: 0.7914, max: 0.7991
XGB_model cross validation accuarcy score: 0.848 +/- 0.0024 (std) min: 0.8443, max: 0.851
