In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn. model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#setting up the dataframe
# NOTE(review): absolute, machine-specific path; point this at a configurable data directory before sharing.
hotel_reservations = pd.read_csv(r"C:\Cellula\first project\first inten project.csv")
#tidy the headers: spaces become underscores, then the underscore that follows "price" is removed
hotel_reservations.columns = hotel_reservations.columns.str.replace(' ', '_').str.replace('price_', 'price')
#the date 2018-2-29 does not exist on the calendar so it was replaced with 2018-2-28
reservation_dates = hotel_reservations['date_of_reservation'].str.replace('2018-2-29', '2/28/2018')
hotel_reservations['date_of_reservation'] = pd.to_datetime(reservation_dates)
#derived totals are selected by column name (not by position) so a reordered CSV cannot silently sum the wrong columns
hotel_reservations['total_nights'] = hotel_reservations[['number_of_weekend_nights', 'number_of_week_nights']].sum(axis=1)
hotel_reservations['total_individuals'] = hotel_reservations[['number_of_adults', 'number_of_children']].sum(axis=1)
hotel_reservations['reservation_month'] = hotel_reservations['date_of_reservation'].dt.month
#season of the reservation date: every row starts as winter, then the (inclusive) month ranges below overwrite it
hotel_reservations['reservation_season'] = 'winter'
season_months = {'spring': (3, 5), 'summer': (6, 8), 'autumn': (9, 11)}
for season, (first_month, last_month) in season_months.items():
    in_season = hotel_reservations['reservation_month'].between(first_month, last_month)
    hotel_reservations.loc[in_season, 'reservation_season'] = season

In [3]:
#capping lead time and average price at the IQR fences (q1 - 1.5*iqr, q3 + 1.5*iqr) instead of dropping rows, since they have a lot of outliers
#the fences are stored as lower_bound/upper_bound so the builtins min() and max() are not shadowed for the rest of the notebook
for col in ['lead_time', 'average_price']:
    q1 = np.percentile(hotel_reservations[col], 25, method='midpoint')
    q3 = np.percentile(hotel_reservations[col], 75, method='midpoint')
    iqr = q3 - q1
    upper_bound = q3 + 1.5 * iqr
    lower_bound = q1 - 1.5 * iqr
    if col == 'lead_time': #fences are rounded as lead time data type is int64
        upper_bound = round(upper_bound)
        lower_bound = round(lower_bound)
    #values above the upper fence become the upper fence, values below the lower fence become the lower fence
    hotel_reservations[col] = hotel_reservations[col].clip(lower=lower_bound, upper=upper_bound)

In [4]:
#dropping outlier rows for number of nights and special requests since they have only a few outliers
#columns are processed one after another, so each column's quartiles are computed on the rows that survived the previous columns
#the fences are stored as lower_bound/upper_bound so the builtins min() and max() are not shadowed
for col in ['number_of_week_nights', 'number_of_weekend_nights', 'total_nights', 'special_requests']:
    q1 = np.percentile(hotel_reservations[col], 25, method='midpoint')
    q3 = np.percentile(hotel_reservations[col], 75, method='midpoint')
    iqr = q3 - q1
    upper_bound = round(q3 + 1.5 * iqr)
    lower_bound = round(q1 - 1.5 * iqr)
    #a row is an outlier when it lies strictly outside the rounded fences
    is_outlier = (hotel_reservations[col] > upper_bound) | (hotel_reservations[col] < lower_bound)
    hotel_reservations.drop(index=hotel_reservations.index[is_outlier], inplace=True)

In [5]:
#only 22 reservations have above 2 children, 16 reservations have above 3 adults and 18 reservations have above 4 total individuals so these reservations will be dropped
#largest value kept for each column; rows strictly above the limit are removed, one column at a time
max_allowed = {'number_of_children': 2, 'number_of_adults': 3, 'total_individuals': 4}
for col, limit in max_allowed.items():
    outlier_index = hotel_reservations.index[hotel_reservations[col] > limit]
    hotel_reservations.drop(index=outlier_index, inplace=True)


In [6]:
#renumber the rows 0..n-1 after the outlier drops (the later column-wise concats rely on this), then count missing values per column
hotel_reservations.reset_index(inplace=True, drop=True)
hotel_reservations.isna().sum()

Booking_ID                  0
number_of_adults            0
number_of_children          0
number_of_weekend_nights    0
number_of_week_nights       0
type_of_meal                0
car_parking_space           0
room_type                   0
lead_time                   0
market_segment_type         0
repeated                    0
P-C                         0
P-not-C                     0
average_price               0
special_requests            0
date_of_reservation         0
booking_status              0
total_nights                0
total_individuals           0
reservation_month           0
reservation_season          0
dtype: int64

In [7]:
#dropping the columns that are excluded from modelling (booking id, raw date, month, room type and meal type)
unused_columns = ['Booking_ID', 'date_of_reservation', 'reservation_month', 'room_type', 'type_of_meal']
hotel_reservations.drop(columns=unused_columns, inplace=True)

In [8]:
#booking status label encoding: the text labels are replaced in place by integer class codes
# NOTE(review): the code-to-label mapping is available as label_encoder.classes_ -- confirm which status is class 0 / class 1
label_encoder = preprocessing.LabelEncoder()
encoded_status = label_encoder.fit_transform(hotel_reservations['booking_status'])
hotel_reservations['booking_status'] = encoded_status

In [9]:
#one-hot encoding of categorical features (market segment -> mst_*, reservation season -> rs_*)
hotel_reservations.rename(columns={'market_segment_type': 'mst', 'reservation_season': 'rs'}, inplace=True)
categorical_cols = ['mst', 'rs']
onehotencoder = preprocessing.OneHotEncoder(sparse_output=False)
ohe = onehotencoder.fit_transform(hotel_reservations[categorical_cols])
#the encoded frame reuses the source frame's index so the column-wise concat pairs rows by label;
#with a default 0..n-1 index, any gap in hotel_reservations' index would misalign rows and introduce NaNs
ohe_df = pd.DataFrame(
    ohe,
    columns=[name.lower() for name in onehotencoder.get_feature_names_out(categorical_cols)],
    index=hotel_reservations.index,
)
hs_encoded = pd.concat([hotel_reservations, ohe_df], axis=1)
#the original text columns are no longer needed once their indicator columns exist
hs_encoded.drop(categorical_cols, axis=1, inplace=True)


In [10]:
#normalizing non-encoded features
# NOTE(review): per the scikit-learn docs, preprocessing.Normalizer rescales each ROW (sample) to unit norm,
# not each feature column. If per-feature scaling was intended, StandardScaler or MinMaxScaler
# (fitted on the training split only) is the usual choice -- confirm the intent before changing, since
# every downstream result depends on it.
scaler = preprocessing.Normalizer()
non_scaled_feat = ['number_of_adults', 'number_of_children', 'number_of_weekend_nights', 
                    'number_of_week_nights', 'lead_time', 'P-C', 'P-not-C', 'average_price', 
                    'special_requests', 'total_nights', 'total_individuals']
scaled_data = scaler.fit_transform(hs_encoded[non_scaled_feat])
#keep hs_encoded's index on the transformed frame so the concat below aligns rows by label instead of
#relying on hs_encoded happening to carry a default 0..n-1 index
scaled_df = pd.DataFrame(scaled_data, columns=non_scaled_feat, index=hs_encoded.index)
#swap the raw numeric columns for their transformed versions (transformed columns end up last)
hs_encoded.drop(non_scaled_feat, axis=1, inplace=True)
hs_scaled_encoded = pd.concat([hs_encoded, scaled_df], axis=1)

In [11]:
#selecting the features whose correlation with booking status is at least 0.1 in magnitude
hs_corr = hs_scaled_encoded.corr()
target_corr = hs_corr['booking_status']
#the 0.99 ceiling keeps booking_status itself (correlation 1.0 with itself) out of the list
is_positive = (target_corr >= 0.1) & (target_corr <= 0.99)
postive_corr = target_corr[is_positive].index.tolist()
print('positively correlated features:', postive_corr)
is_negative = target_corr < -0.1
negative_corr = target_corr[is_negative].index.tolist()
print('negatively correlated features:', negative_corr)
#combined list, positives first, used later as the feature subset
corr = postive_corr + negative_corr

positively correlated features: ['repeated', 'mst_corporate', 'rs_winter', 'number_of_adults', 'number_of_week_nights', 'average_price', 'special_requests', 'total_nights', 'total_individuals']
negatively correlated features: ['mst_online', 'rs_summer', 'lead_time']


In [12]:
#setting target variable: booking_status is taken out of the feature frame and kept separately as the label
hs_target = hs_scaled_encoded.pop('booking_status')

In [13]:
#KNN baseline on every feature: hold out 20% of the rows, fit on the rest, report accuracy on the hold-out
x_train, x_test, y_train, y_test = train_test_split(
    hs_scaled_encoded,
    hs_target,
    test_size=0.2,
    random_state=45,
)
model = KNeighborsClassifier(n_neighbors=7)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print('testing score =', knn_accuracy)

testing score = 0.8308991981672395


In [14]:
#multicollinearity analysis: one variance inflation factor per correlation-selected feature
# NOTE(review): no intercept column is added to the matrix passed to variance_inflation_factor;
# presumably this inflates the reported values -- confirm against the statsmodels documentation.
reg_hs = hs_scaled_encoded[corr]
feature_matrix = reg_hs.values
vif_data = pd.DataFrame({
    'feature': corr,
    'vif': [variance_inflation_factor(feature_matrix, position) for position in range(len(corr))],
})
vif_data

Unnamed: 0,feature,vif
0,repeated,1.30171
1,mst_corporate,1.519431
2,rs_winter,1.33897
3,number_of_adults,35.322412
4,number_of_week_nights,11.176169
5,average_price,9.184286
6,special_requests,1.869273
7,total_nights,13.57862
8,total_individuals,34.012298
9,mst_online,3.597909


#### Number of adults, number of week nights, total nights and total individuals have high multicollinearity

In [15]:
#logistic regression on the correlation-selected features (20% hold-out)
x_train, x_test, y_train, y_test = train_test_split(reg_hs, hs_target, test_size= 0.2, random_state= 22)
#max_iter matches the weighted model trained later on this same split, so the two reports differ
#because of the class weights and not because of a different solver iteration budget
classifier = LogisticRegression(random_state= 75, max_iter= 1000)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.49      0.56      2328
           1       0.77      0.87      0.82      4656

    accuracy                           0.74      6984
   macro avg       0.72      0.68      0.69      6984
weighted avg       0.73      0.74      0.73      6984



In [16]:
#imbalanced data ratio: number of class-1 rows divided by number of class-0 rows in the full target
bs_count = hs_target.value_counts()
bs_ratio = bs_count.loc[1] / bs_count.loc[0]
bs_ratio

2.0207612456747404

In [17]:
#weighted logistic regression: class 0 is weighted by the class-1/class-0 ratio, class 1 keeps weight 1
#reuses the train/test split produced for the unweighted logistic regression
weights = {0: bs_ratio, 1: 1}
weighted_classifier = LogisticRegression(
    class_weight=weights,
    max_iter=1000,
    random_state=36,
)
weighted_classifier.fit(x_train, y_train)
y_pred_weighted = weighted_classifier.predict(x_test)
weighted_report = classification_report(y_test, y_pred_weighted)
print(weighted_report)

              precision    recall  f1-score   support

           0       0.58      0.71      0.64      2328
           1       0.84      0.74      0.79      4656

    accuracy                           0.73      6984
   macro avg       0.71      0.73      0.71      6984
weighted avg       0.75      0.73      0.74      6984

