# Predicting Booking Cancellations: Hotel Reservations

## Importing Libraries

In [254]:
import pandas as pd 
import pycountry as pc
import pycountry_convert as pcc

## Loading and Exploring the Dataset

In [255]:
# Read csv file and create a DataFrame
df = pd.read_csv('dataset/hotel_bookings_extra_columns.csv')

In [256]:
# Inspect the dataset shaoe
print(f'Number of rows: {df.shape[0]} \nNumber of columns: {df.shape[1]}')

Number of rows: 119390 
Number of columns: 36


In [257]:
pd.set_option('display.max_columns', None)

In [258]:
# Show the first 5 rows of the DataFrame
df.head(20)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,Linda Hines,LHines@verizon.com,713-226-5883,************5498
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03,Jasmine Fletcher,JFletcher43@xfinity.com,190-271-6743,************9263
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03,Dylan Rangel,Rangel.Dylan@comcast.net,420-332-5209,************6994
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,0.0,0,FB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03,William Velez,Velez_William@mail.com,286-669-4333,************8729
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06,Steven Murphy,Steven.Murphy54@aol.com,341-726-5787,************3639
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,D,D,0,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22,Michael Moore,MichaelMoore81@outlook.com,316-648-6176,************9190


In [259]:
# Show information about columns and datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [260]:
# Inspect every feature and add categorical attributes to a list
cat_col = ['hotel','arrival_date_year','arrival_date_month','meal','country','market_segment','distribution_channel','reserved_room_type',
          'assigned_room_type','deposit_type','agent','company','customer_type','reservation_status']

In [261]:
# Add boolean features to a list
bool_col = ['is_canceled','is_repeated_guest']

In [262]:
# Convert every categorical attribute's datatype to 'object'
for i in cat_col:
    df[i] = df[i].astype(object)

In [263]:
# Convert boolean attribute's datatype to 'bool'
for i in bool_col:
    df[i] = df[i].astype(bool)

In [264]:
# Show summary statistics for numerical values, transposed for better readability
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_week_number,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
children,119386.0,0.10389,0.398561,0.0,0.0,0.0,0.0,10.0
babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0
previous_cancellations,119390.0,0.087118,0.844336,0.0,0.0,0.0,0.0,26.0
previous_bookings_not_canceled,119390.0,0.137097,1.497437,0.0,0.0,0.0,0.0,72.0


In [265]:
# Show summary statistics of categorical columns, transposed
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
hotel,119390.0,2.0,City Hotel,79330.0
arrival_date_year,119390.0,3.0,2016,56707.0
arrival_date_month,119390.0,12.0,August,13877.0
meal,119390.0,5.0,BB,92310.0
country,118902.0,177.0,PRT,48590.0
market_segment,119390.0,8.0,Online TA,56477.0
distribution_channel,119390.0,5.0,TA/TO,97870.0
reserved_room_type,119390.0,10.0,A,85994.0
assigned_room_type,119390.0,12.0,A,74053.0
deposit_type,119390.0,3.0,No Deposit,104641.0


In [266]:
# Show fields that contain null values and the number of null values
null_sum = df.isna().sum()
null_sum = null_sum[null_sum>0]
null_sum

children         4
country        488
agent        16340
company     112593
dtype: int64

In [267]:
# Check if there are duplicated rows in the dataset
df.duplicated().sum()

0

## Data Preprocessing

### Handling Null Values

In [268]:
# Calculate and display the rate of null values to the number of rows in the dataset for each column that contains null values
null_sum / df.shape[0] * 100

children     0.003350
country      0.408744
agent       13.686238
company     94.306893
dtype: float64

Since 'children' and 'country' columns' rate is so low, the rows containing null values in these 2 columns will be dropped.

In [269]:
# Drop any column that has null value in these 2 columns
df = df.dropna(subset = ['children','country'])

Due to their high rate of null values and irrelevancy to the target value, remaining 2 features will be dropped completely.

In [270]:
# Drop 'agent' and 'company' features
df = df.drop(labels = ['agent','company'], axis=1)

In [271]:
# Check to see if there are any null values remained
df.isna().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

### Feature Selection

In the dataset, 4 features include personal information about customers, which are not relevant to our target variable. Those 4 features will be dropped.

In [272]:
# Drop columns that contain personal information
df = df.drop(['name','email','phone-number','credit_card'],axis=1)

There are 2 attributes that give information about the target variable 'is_canceled' which causes data leakage. To prevent this from happening, following attributes will be dropped:
- reservation_status
- reservation_status_date

In [273]:
# Drop the columns that cause data leakage
df = df.drop(['reservation_status','reservation_status_date'],axis=1)

Checking the unique values in each feature is useful to detect any attributes with high cardinality and determining if a new feature can be created.

In [274]:
# Show the number of unique values of categorical features
df.select_dtypes(include='object').nunique().sort_values(ascending=False)

country                 177
arrival_date_month       12
assigned_room_type       12
reserved_room_type       10
market_segment            7
meal                      5
distribution_channel      5
customer_type             4
arrival_date_year         3
deposit_type              3
hotel                     2
dtype: int64

The 'country' feature has high-cardinality. The best practice is to group it into a 'continent' feature to reduce the cardinality.

#### Creating 'continent' Feature

For creating 'continent' feature, 'pycountry' and 'pycountry_convert' libraries will be used.  
Our dataset contains 3-digit country codes (ISO 3166-1 alpha-3) as the 'country' feature. A new feature using 2-digit country codes (ISO 3166-1 alpha-2) needs to be created.

In [275]:
# Some adjustments to correct irregularities in the dataset
# Convert 'CN' values to 'CHN' which is country code of China
df.loc[df['country'] == 'CN', 'country'] = 'CHN'

# Drop columns with the country code 'TMP', which is not a standard code defined in ISO 3166
df = df.query('country not in ("TMP")')

In [276]:
# Create 'country_a2' feature, 2-digit country codes
df['country_a2'] = list(map(lambda x: pcc.country_alpha3_to_country_alpha2(x), df['country']))

In [277]:
# Show the last 5 rows to check the new column visually
df[['country','country_a2']].tail()

Unnamed: 0,country,country_a2
119385,BEL,BE
119386,FRA,FR
119387,DEU,DE
119388,GBR,GB
119389,DEU,DE


In [278]:
# Drop columns with the country codes 'UM','AQ' and 'TF' which are not standard codes
df = df.query('country_a2 not in ("UM","AQ","TF")')

In [279]:
# Create 'continent' feature using 'pycountry_convert' library
df['continent'] = list(map(lambda x: pcc.country_alpha2_to_continent_code(x), df['country_a2']))

In [280]:
# Show the last 5 rows
df[['country','country_a2','continent']].tail()

Unnamed: 0,country,country_a2,continent
119385,BEL,BE,EU
119386,FRA,FR,EU
119387,DEU,DE,EU
119388,GBR,GB,EU
119389,DEU,DE,EU


In [281]:
# Display the number of bookings for each continent
df['continent'].value_counts()

continent
EU    107822
AS      4468
SA      2708
NA      2274
AF      1113
OC       506
Name: count, dtype: int64

By grouping the countries into their respective continents, the cardinality of the feature is decreased.  
This will prevent the model from overfitting to our training dataset and enhance its ability to generalize to unseen data.

In [282]:
df = df.drop(['country','country_a2'],axis = 1)

#### Creating 'is_room_type_matched' Feature

Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request.  
This feature is added to reflect whether the customer is assigned to their requested room type.

In [283]:
# Add the new boolean feature based on the condition
df['is_room_type_matched'] = df['reserved_room_type'] == df['assigned_room_type']

In [284]:
# Check the dtype of the new feature
df['is_room_type_matched'].dtype

dtype('bool')

In [285]:
# Drop both the features 'reserved_room_type' and 'assigned_room_type'
df = df.drop(['reserved_room_type','assigned_room_type'],axis =1)

#### Creating 'meal_ordinal' Feature

The 'meal' feature includes the meal packages that customers choose while making bookings.  
The meanings of the codes in the dataset are as follows:

Undefined/SC – no meal package\
BB – Bed & Breakfast\
HB – Half board (breakfast and one other meal)\
FB – Full board (breakfast, lunch and dinner)

Thus, an ordinal feature can be derived from this feature with values ranging from 0 to 3, where 0 represents the cheapest option and 3 the most expensive.

In [286]:
# Display the unique values and their number of occurence in the dataset
df['meal'].value_counts()

meal
BB           91858
HB           14434
SC           10636
Undefined     1165
FB             798
Name: count, dtype: int64

In [287]:
# Create a dictionary to map categorical meal options to ordinal values
meal_dict = {'Undefined':0, 'SC':0, 'BB':1, 'HB':2, 'FB':3}

In [288]:
# Use meal_dict dictionary to map categorical values to integers
df['meal_ordinal'] = df['meal'].map(meal_dict)

In [289]:
# Check the dtype
print(df['meal_ordinal'].dtype)

int64


In [290]:
# Drop the meal feature
df = df.drop('meal',axis =1)

#### Creating 'season' Feature

According to Travel+Leisure, a well known travel magazine, Portugal's tourism seasons fall into 3 categories:\
(Note that both hotels included in the dataset are located in Lisbon, Portugal.)

High Season: June through August\
Shoulder Season: September to November; April to June\
Low Season: December to April

In [291]:
# Count and display the number of bookings in each month
df['arrival_date_month'].value_counts()

arrival_date_month
August       13852
July         12627
May          11778
October      11094
April        11045
June         10926
September    10467
March         9739
February      8012
November      6752
December      6726
January       5873
Name: count, dtype: int64

In [292]:
# Create the function to map months to seasons
def month_to_season(month):
    """ Convert the month name to tourism season."""
    
    if month in ['June','July','August']:
        return 'High'
    elif month in ['September','October','November','April','May']:
        return 'Shoulder'
    elif month in ['December','January','February','March']:
        return 'Low'
    else:
        return 'Unidentified'

In [293]:
# Create new feature using the function 'month_to_season'
df['season'] = df['arrival_date_month'].apply(month_to_season)

In [294]:
# Display and count the unique values in 'season' column
df['season'].value_counts()

season
Shoulder    51136
High        37405
Low         30350
Name: count, dtype: int64

In [295]:
# Drop the feature 'arrival_date_month'
df = df.drop('arrival_date_month',axis = 1)

### Outliers

In [296]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,118891.0,104.314877,106.905191,0.0,18.0,69.0,161.0,737.0
arrival_date_week_number,118891.0,27.166312,13.589783,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,118891.0,15.801019,8.780356,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,118891.0,0.928893,0.996228,0.0,0.0,1.0,2.0,16.0
stays_in_week_nights,118891.0,2.502199,1.900198,0.0,1.0,2.0,3.0,41.0
adults,118891.0,1.858391,0.578572,0.0,2.0,2.0,2.0,55.0
children,118891.0,0.104196,0.399146,0.0,0.0,0.0,0.0,10.0
babies,118891.0,0.007948,0.097383,0.0,0.0,0.0,0.0,10.0
previous_cancellations,118891.0,0.087147,0.845894,0.0,0.0,0.0,0.0,26.0
previous_bookings_not_canceled,118891.0,0.131642,1.484715,0.0,0.0,0.0,0.0,72.0


In [297]:
df[df['adr'] >1000]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,continent,is_room_type_matched,meal_ordinal,season
48515,City Hotel,True,35,2016,13,25,0,1,2,0.0,0,Offline TA/TO,TA/TO,False,0,0,1,Non Refund,0,Transient,5400.0,0,0,EU,True,1,Low


In [298]:
df.shape

(118891, 27)

In [299]:
df = df.drop(df[df['adr']>1000].index)

In [300]:
df.shape

(118890, 27)

In [301]:
df = df.reset_index(drop=True)

### Encoding Categorical Variables

In [302]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
hotel,118890,2,City Hotel,79294
arrival_date_year,118890,3,2016,56431
market_segment,118890,7,Online TA,56396
distribution_channel,118890,5,TA/TO,97723
deposit_type,118890,3,No Deposit,104156
customer_type,118890,4,Transient,89166
continent,118890,6,EU,107821
season,118890,3,Shoulder,51136


### Nominal Features

In [303]:
from sklearn.preprocessing import OneHotEncoder

In [304]:
lstt = ['hotel','arrival_date_year','market_segment','distribution_channel','deposit_type','customer_type','continent','season']

In [305]:
encoder = OneHotEncoder(categories = 'auto',
                       sparse_output = False, drop= 'if_binary')
encoded = encoder.fit(df[['hotel','arrival_date_year','market_segment','distribution_channel','deposit_type','customer_type','continent','season']])

In [306]:
encoded.categories_

[array(['City Hotel', 'Resort Hotel'], dtype=object),
 array([2015, 2016, 2017], dtype=object),
 array(['Aviation', 'Complementary', 'Corporate', 'Direct', 'Groups',
        'Offline TA/TO', 'Online TA'], dtype=object),
 array(['Corporate', 'Direct', 'GDS', 'TA/TO', 'Undefined'], dtype=object),
 array(['No Deposit', 'Non Refund', 'Refundable'], dtype=object),
 array(['Contract', 'Group', 'Transient', 'Transient-Party'], dtype=object),
 array(['AF', 'AS', 'EU', 'NA', 'OC', 'SA'], dtype=object),
 array(['High', 'Low', 'Shoulder'], dtype=object)]

In [307]:
array_encoded = encoded.transform(df[['hotel','arrival_date_year','market_segment','distribution_channel','deposit_type','customer_type','continent','season']])

In [329]:
df_encoded = pd.DataFrame(data = array_encoded, columns = encoded.get_feature_names_out(), dtype = 'bool')

In [330]:
df_encoded.head()

Unnamed: 0,hotel_Resort Hotel,arrival_date_year_2015,arrival_date_year_2016,arrival_date_year_2017,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,continent_AF,continent_AS,continent_EU,continent_NA,continent_OC,continent_SA,season_High,season_Low,season_Shoulder
0,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False
1,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False
2,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False
3,True,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False
4,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False


In [331]:
df_encoded.shape

(118890, 32)

In [311]:
df.shape

(118890, 27)

In [333]:
df_merged = df.merge(df_encoded, left_index=True, right_index=True, validate='one_to_one')

In [334]:
df_merged.shape

(118890, 59)

In [335]:
df_merged = df_merged.drop(['hotel','arrival_date_year','market_segment','distribution_channel','deposit_type','customer_type','continent','season'], axis=1)

In [336]:
df_merged.shape

(118890, 51)

In [338]:
import numpy as np

In [339]:
df_merged.describe(include= np.number).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,118890.0,104.31546,106.905451,0.0,18.0,69.0,161.0,737.0
arrival_date_week_number,118890.0,27.166431,13.589778,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,118890.0,15.800942,8.780353,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,118890.0,0.928901,0.996229,0.0,0.0,1.0,2.0,16.0
stays_in_week_nights,118890.0,2.502212,1.900201,0.0,1.0,2.0,3.0,41.0
adults,118890.0,1.85839,0.578574,0.0,2.0,2.0,2.0,55.0
children,118890.0,0.104197,0.399147,0.0,0.0,0.0,0.0,10.0
babies,118890.0,0.007949,0.097383,0.0,0.0,0.0,0.0,10.0
previous_cancellations,118890.0,0.087148,0.845897,0.0,0.0,0.0,0.0,26.0
previous_bookings_not_canceled,118890.0,0.131643,1.484721,0.0,0.0,0.0,0.0,72.0


In [340]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118890 entries, 0 to 118889
Data columns (total 51 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     118890 non-null  bool   
 1   lead_time                       118890 non-null  int64  
 2   arrival_date_week_number        118890 non-null  int64  
 3   arrival_date_day_of_month       118890 non-null  int64  
 4   stays_in_weekend_nights         118890 non-null  int64  
 5   stays_in_week_nights            118890 non-null  int64  
 6   adults                          118890 non-null  int64  
 7   children                        118890 non-null  float64
 8   babies                          118890 non-null  int64  
 9   is_repeated_guest               118890 non-null  bool   
 10  previous_cancellations          118890 non-null  int64  
 11  previous_bookings_not_canceled  118890 non-null  int64  
 12  booking_changes 

### Splitting the Dataset

In [341]:
X = df_merged.drop('is_canceled',axis=1)
y = df_merged['is_canceled']

In [342]:
from sklearn.model_selection import train_test_split

In [343]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [344]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(95112, 50) (23778, 50) (95112,) (23778,)


#### Scaling the Dataset

In [345]:
from sklearn.preprocessing import MinMaxScaler

In [376]:
scaler = MinMaxScaler()

In [380]:
num_cols = X_train.columns[(X_train.dtypes == 'int64') | (X_train.dtypes == 'float64')].tolist()

In [381]:
num_cols

['lead_time',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'meal_ordinal']

In [382]:
X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])

In [383]:
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaler.fit_transform(X_test[num_cols])

## Creating the Model

In [385]:
from sklearn.linear_model import LogisticRegression

In [390]:
lr = LogisticRegression(random_state=99, solver = 'sag')
model = lr.fit(X_train_scaled, y_train)

In [393]:
type(model)

sklearn.linear_model._logistic.LogisticRegression

In [394]:
predictions = model.predict(X_test_scaled)

In [399]:
X_test_scaled['prediction'] = predictions
X_test_scaled['label'] = y_test

In [400]:
X_test_scaled.head()

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,is_room_type_matched,meal_ordinal,hotel_Resort Hotel,arrival_date_year_2015,arrival_date_year_2016,arrival_date_year_2017,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,continent_AF,continent_AS,continent_EU,continent_NA,continent_OC,continent_SA,season_High,season_Low,season_Shoulder,prediction,label
85995,0.074722,0.25,0.0,0.111111,0.083333,0.074074,0.0,0.0,False,0.0,0.0,0.0,0.0,0.209055,0.0,0.0,True,0.333333,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False
21939,0.022258,0.211538,0.6,0.222222,0.125,0.074074,0.0,0.0,False,0.0,0.0,0.058824,0.0,0.114173,0.0,0.0,True,0.0,True,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False
107362,0.073132,0.192308,0.466667,0.0,0.166667,0.074074,0.0,0.0,False,0.0,0.0,0.0,0.0,0.247638,0.0,0.4,False,0.333333,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False
109430,0.165342,0.269231,0.433333,0.222222,0.125,0.074074,0.0,0.0,False,0.0,0.0,0.0,0.0,0.281693,0.0,0.2,True,0.333333,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,True,False
97475,0.0,0.730769,0.7,0.0,0.041667,0.037037,0.0,0.0,False,0.0,0.0,0.0,0.0,0.374016,0.0,0.4,True,0.333333,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False


In [401]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

In [403]:
f1_score(X_test_scaled['label'],X_test_scaled['prediction'])

0.7158941190561744

In [404]:
accuracy_score(X_test_scaled['label'],X_test_scaled['prediction'])

0.8004878459079822

In [405]:
recall_score(X_test_scaled['label'],X_test_scaled['prediction'])

0.6838672768878719