<a href="https://colab.research.google.com/github/swilsonmfc/pandas/blob/main/8_Solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hotel Bookings

![](https://cdn.inprnt.com/thumbs/5b/fa/5bfa6a8ea6c461d1c21c0a44962968ed.jpg?response-cache-control=max-age=2628000)

# Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error

import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as regplot
from statsmodels.stats.outliers_influence import variance_inflation_factor

  import pandas.util.testing as tm


# Data
* https://www.kaggle.com/jessemostipak/hotel-booking-demand

In [2]:
!wget -O hotel.zip https://github.com/swilsonmfc/pandas/blob/main/hotel.zip?raw=true

--2021-07-23 19:07:31--  https://github.com/swilsonmfc/pandas/blob/main/hotel.zip?raw=true
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/swilsonmfc/pandas/raw/main/hotel.zip [following]
--2021-07-23 19:07:31--  https://github.com/swilsonmfc/pandas/raw/main/hotel.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/swilsonmfc/pandas/main/hotel.zip [following]
--2021-07-23 19:07:31--  https://raw.githubusercontent.com/swilsonmfc/pandas/main/hotel.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1308365 (1.2M) [ap

In [3]:
!unzip -o hotel.zip

Archive:  hotel.zip
  inflating: hotel_bookings.csv      


In [4]:
hotel_df = pd.read_csv('./hotel_bookings.csv')

In [5]:
hotel_df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


## Types

In [6]:
hotel_df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

## Missing
* Missing data here seems reasonable for the most part
* 4 missing children - Impute to median

In [7]:
hotel_df.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [8]:
# Series where method vs np.where
hotel_df['children'] = hotel_df.children.where(hotel_df.children.notna(), 0)
hotel_df['country']  = hotel_df.country.fillna('UNKNOWN')
hotel_df['agent']    = hotel_df.agent.fillna('NONE')
hotel_df['company']  = hotel_df.company.fillna('NONE')

In [9]:
hotel_df.isna().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                         

## Outliers
* More than 10 people booked in a room
  * All cancelled
  * Agent 96 is present on 1/2 of them
* The hotel paid one person to stay there
  * Best to truncate that to $0
  * One room cost \$5,400

In [10]:
hotel_df = hotel_df[(hotel_df.adr >= 0) & (hotel_df.adr < 1000)]

# Logistic Regression

In [11]:
f = 'is_canceled ~ hotel + arrival_date_month + deposit_type'
model = smf.logit(formula=f, data=hotel_df)
fitted = model.fit()
print(fitted.summary())

Optimization terminated successfully.
         Current function value: 0.523298
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:            is_canceled   No. Observations:               119388
Model:                          Logit   Df Residuals:                   119373
Method:                           MLE   Df Model:                           14
Date:                Fri, 23 Jul 2021   Pseudo R-squ.:                  0.2061
Time:                        19:07:36   Log-Likelihood:                -62475.
converged:                       True   LL-Null:                       -78698.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                          -0.7494      0.023    -32.483  

In [12]:
f = """is_canceled ~ + C(arrival_date_month, Treatment(reference='January'))
             + C(hotel, Treatment(reference='Resort Hotel')) 
             + C(deposit_type)""" 
model = smf.logit(formula=f, data=hotel_df)
fitted = model.fit()
print(fitted.summary())

Optimization terminated successfully.
         Current function value: 0.523298
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:            is_canceled   No. Observations:               119388
Model:                          Logit   Df Residuals:                   119373
Method:                           MLE   Df Model:                           14
Date:                Fri, 23 Jul 2021   Pseudo R-squ.:                  0.2061
Time:                        19:07:38   Log-Likelihood:                -62475.
converged:                       True   LL-Null:                       -78698.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                                         coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------------------------

In [13]:
# Generate a single prediction
pred_df = pd.DataFrame([{'hotel': 'City Hotel', 'arrival_date_month': 'July', 'deposit_type': 'Refundable'}])
proba = fitted.predict(pred_df)
pred  = np.where(proba < .5, 0, 1) # Round works here too!
pred

array([0])

In [14]:
# Generate a prediction for a resort hotel in October with non-refundable ticket (Non Refund)
pred_df = pd.DataFrame([{'hotel': 'Resort Hotel', 'arrival_date_month': 'October', 'deposit_type': 'Non Refund'}])
proba = fitted.predict(pred_df)
pred  = np.where(proba < .5, 0, 1) # Round works here too!
pred

array([1])

In [15]:
# Predict over the entire dataset
proba = fitted.predict(hotel_df)
pred  = np.where(proba < .5, 0, 1) # Round works here too!
pred

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Produce a confusion matrix
confusion_matrix(hotel_df.is_canceled, pred)

array([[75072,    93],
       [29730, 14493]])

In [17]:
# Print the classification report
print(classification_report(hotel_df.is_canceled, pred))

              precision    recall  f1-score   support

           0       0.72      1.00      0.83     75165
           1       0.99      0.33      0.49     44223

    accuracy                           0.75    119388
   macro avg       0.85      0.66      0.66    119388
weighted avg       0.82      0.75      0.71    119388



# Sklearn Logistic Regression

In [18]:
# Generate a dataframe of is_canceled, lead_time, adr & deposit type
# Target variable is is_canceled
X = hotel_df[['is_canceled', 'lead_time', 'adr', 'deposit_type']]

In [19]:
# Filter the dataset to Portugal (country == PRT)
X = X[hotel_df.country == 'PRT']
y = X.pop('is_canceled')

In [20]:
# Preprocess deposit type as one hot encoded
X = pd.get_dummies(X, columns=['deposit_type'])

In [21]:
# Train - test split 
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y)

In [22]:
# Scale the lead time & adr
scaler_lead_time = StandardScaler()
X_train['lead_time'] = scaler_lead_time.fit_transform(X_train[['lead_time']])
X_test['lead_time']  = scaler_lead_time.transform(X_test[['lead_time']])

scaler_adr = StandardScaler()
X_train['adr'] = scaler_adr.fit_transform(X_train[['adr']])
X_test['adr']  = scaler_adr.transform(X_test[['adr']])

In [23]:
# Compute class weights for balancing
factor = len(X_train) / sum(y)
weights = {0: 1, 1: factor}
weights

{0: 1, 1: 1.3242604840468057}

In [24]:
# Construct a Linear Regression Object & Fit to the training data
logit = LogisticRegression(class_weight=weights)
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 1, 1: 1.3242604840468057},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

In [25]:
# Predict on the test dataset
pred = logit.predict(X_test)

In [26]:
# Print the classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.70      0.80      0.75      5230
           1       0.83      0.74      0.78      6918

    accuracy                           0.76     12148
   macro avg       0.76      0.77      0.76     12148
weighted avg       0.77      0.76      0.77     12148



# Sklearn Linear Regression

In [27]:
# Generate a dataframe of adr, lead_time, and country
# Target variable is adr
X = hotel_df[['adr', 'lead_time', 'country']]

In [28]:
# Preprocess country as one hot encoded
# Note:  add the parameter drop_first=True to get_dummies
X = pd.get_dummies(X, columns=['country'], drop_first=True)
y = X.pop('adr')

In [29]:
# Train - test split 
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y)

In [30]:
# Construct a Linear Regression Object & Fit to the training data
lin = LinearRegression()
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# Predict on the test dataset
pred = lin.predict(X_test)

In [32]:
# Compute the mean absolute error 
mean_absolute_error(y_test, pred)

35.25548471385818