In [54]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import FunctionTransformer

hotel_data = pd.read_csv("hotel.csv", low_memory=False)

In [55]:
hotel_data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [56]:
hotel_data.shape

(119390, 32)

In [57]:
# Dropped this feature because we have already the date separated in other features 
hotel_data.drop('reservation_status_date', inplace=True, axis=1)
hotel_data.drop('reservation_status', inplace=True, axis=1)

In [58]:
# Convert "children" column to numeric, replacing non-numeric values with NaN
hotel_data['children'] = pd.to_numeric(hotel_data['children'], errors='coerce')

# Calculate the mean of the numeric values in the "children" column
mean_children = hotel_data['children'].mean()

# Replace "na" strings with the mean
hotel_data['children'].fillna(mean_children, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hotel_data['children'].fillna(mean_children, inplace=True)


In [59]:
# check if it worked and there is no "na"
hotel_data['children'].unique()

array([ 0.       ,  1.       ,  2.       , 10.       ,  3.       ,
        0.1038899])

In [60]:
hotel_data.fillna(0, inplace=True)

In [61]:
hotel_data.isna().any().any()

False

In [62]:
categorical_features = ['customer_type',
                       'deposit_type', 'assigned_room_type', 'reserved_room_type', 'distribution_channel',
                       'market_segment', 'country', 'meal', 'hotel', 'arrival_date_month']
for key in categorical_features:
    hotel_data[key] = hotel_data[key].astype(str)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(dtype='int',drop='first'))
])

In [63]:
numeric_features = ['total_of_special_requests', 'required_car_parking_spaces', 'adr',
                   'days_in_waiting_list', 'booking_changes', 'previous_bookings_not_canceled',
                   'previous_cancellations', 'babies', 'children', 'adults', 'stays_in_week_nights',
                   'stays_in_weekend_nights', 'arrival_date_day_of_month', 'arrival_date_week_number',
                   'lead_time']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [64]:
# scale, encode and skip columns with 1/0 values
preprocessor = ColumnTransformer(
    transformers=[
        ('encode', categorical_transformer, categorical_features),
        ('scale', numeric_transformer, numeric_features),
        ('skip', 'passthrough', ['is_repeated_guest', 'is_canceled'])
    ])

In [65]:
# Apply preprocessing to the dataset
processed_data = preprocessor.fit_transform(hotel_data)


In [66]:
# convert the processed data back to DF
hotel_data = pd.DataFrame.sparse.from_spmatrix(processed_data, columns=preprocessor.get_feature_names_out())
print(hotel_data[hotel_data.columns[1:]].corr()['skip__is_canceled'][:].to_string())

encode__customer_type_Transient           0.133084
encode__customer_type_Transient-Party    -0.124135
encode__deposit_type_Non Refund           0.481457
encode__deposit_type_Refundable          -0.011312
encode__assigned_room_type_B             -0.037610
encode__assigned_room_type_C             -0.053877
encode__assigned_room_type_D             -0.128036
encode__assigned_room_type_E             -0.064794
encode__assigned_room_type_F             -0.045978
encode__assigned_room_type_G             -0.019864
encode__assigned_room_type_H             -0.002869
encode__assigned_room_type_I             -0.040784
encode__assigned_room_type_K             -0.032813
encode__assigned_room_type_L              0.003773
encode__assigned_room_type_P              0.013071
encode__reserved_room_type_B             -0.008306
encode__reserved_room_type_C             -0.007337
encode__reserved_room_type_D             -0.047702
encode__reserved_room_type_E             -0.038634
encode__reserved_room_type_F   

In [67]:
# split to features / label
y = hotel_data['skip__is_canceled']
X = hotel_data.drop('skip__is_canceled', axis=1)

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(107451, 245) (11939, 245) (107451,) (11939,)


In [69]:
from sklearn.linear_model import LogisticRegression
import sys
np.set_printoptions(threshold=sys.maxsize)
# instantiate the model (using the default parameters)
logreg = LogisticRegression(max_iter=1000)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [70]:
scores = cross_val_score(logreg, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())

Cross-Validation Scores: [0.71023536 0.72929056 0.7464193  0.71949074 0.74507915]
Mean Accuracy: 0.7301030237038278
Standard Deviation: 0.01413159397744032


In [71]:
from sklearn.metrics import classification_report
target_names = ['no_cancel', 'yes_cancel']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   no_cancel       0.83      0.91      0.87      7522
  yes_cancel       0.82      0.67      0.74      4417

    accuracy                           0.82     11939
   macro avg       0.82      0.79      0.80     11939
weighted avg       0.82      0.82      0.82     11939

