In [154]:
# Linear algebra
import numpy as np 

# Data processing
import pandas as pd 
import datetime
import pycountry_convert as pc
from sklearn.model_selection import train_test_split

# Data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
%matplotlib inline
fig=plt.figure(figsize=(18,6))

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from scipy import stats

# Display options
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 300

<Figure size 1296x432 with 0 Axes>

In [155]:
# Load raw dataset
hotel_data_raw = pd.read_csv('./data/hotel_bookings.csv')
hotel_data_raw.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [156]:
hotel_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
hotel                             119390 non-null object
is_canceled                       119390 non-null int64
lead_time                         119390 non-null int64
arrival_date_year                 119390 non-null int64
arrival_date_month                119390 non-null object
arrival_date_week_number          119390 non-null int64
arrival_date_day_of_month         119390 non-null int64
stays_in_weekend_nights           119390 non-null int64
stays_in_week_nights              119390 non-null int64
adults                            119390 non-null int64
children                          119386 non-null float64
babies                            119390 non-null int64
meal                              119390 non-null object
country                           118902 non-null object
market_segment                    119390 non-null object
distribution_channel              119390 n

In [157]:
# Check all numerical variables
print('Numerical features:')
print(list(hotel_data_raw.select_dtypes([np.number]).columns))
print('Some of them are not in appropriate type. We will convert them to strings.')
# Change data types from number to string
num_to_str = ['is_canceled','arrival_date_year','arrival_date_week_number','arrival_date_day_of_month','is_repeated_guest']
for feature in num_to_str:
    hotel_data_raw[feature] = hotel_data_raw[feature].astype(str)

Numerical features:
['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']
Some of them are not in appropriate type. We will convert them to strings.


In [158]:
hotel_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
hotel                             119390 non-null object
is_canceled                       119390 non-null object
lead_time                         119390 non-null int64
arrival_date_year                 119390 non-null object
arrival_date_month                119390 non-null object
arrival_date_week_number          119390 non-null object
arrival_date_day_of_month         119390 non-null object
stays_in_weekend_nights           119390 non-null int64
stays_in_week_nights              119390 non-null int64
adults                            119390 non-null int64
children                          119386 non-null float64
babies                            119390 non-null int64
meal                              119390 non-null object
country                           118902 non-null object
market_segment                    119390 non-null object
distribution_channel              1193

In [159]:
print('We see the numerical features have different range and scale. We have to normalize it for modelling.')
hotel_data_raw.describe()

We see the numerical features have different range and scale. We have to normalize it for modelling.


Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,104.011416,0.927599,2.500302,1.856403,0.10389,0.007949,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,106.863097,0.998613,1.908286,0.579261,0.398561,0.097436,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,18.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,69.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,160.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,737.0,19.0,50.0,55.0,10.0,10.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [160]:
# Normalization (using minmax) for all numerical features
for num_feature in hotel_data_raw.select_dtypes([np.number]).columns:
    minmax_scaler = preprocessing.MinMaxScaler().fit(hotel_data_raw[[num_feature]])
    hotel_data_raw[num_feature] = minmax_scaler.transform(hotel_data_raw[[num_feature]])

In [161]:
print('Normalized results:')
hotel_data_raw.describe()

Normalized results:


Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.141128,0.048821,0.050006,0.033753,0.010389,0.000795,0.003351,0.001904,0.01053,0.160474,0.341279,0.005936,0.020015,0.007815,0.114273
std,0.144997,0.052559,0.038166,0.010532,0.039856,0.009744,0.032474,0.020798,0.031062,0.207443,0.245168,0.044999,0.009347,0.030661,0.15856
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.024423,0.0,0.02,0.036364,0.0,0.0,0.0,0.0,0.0,0.014981,0.104283,0.0,0.013996,0.0,0.0
50%,0.093623,0.052632,0.04,0.036364,0.0,0.0,0.0,0.0,0.0,0.024345,0.32216,0.0,0.018673,0.0,0.0
75%,0.217096,0.105263,0.06,0.036364,0.0,0.0,0.0,0.0,0.0,0.426966,0.49162,0.0,0.024486,0.0,0.2
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [162]:
# Handle missing values

# Fill missing values in 'children' with mode
hotel_data_raw['children'].fillna(hotel_data_raw['children'].value_counts().index[0],inplace = True)

# Fill missing values in 'country' with mode
hotel_data_raw['country'].fillna(hotel_data_raw['country'].value_counts().index[0],inplace = True)

# Do nothing for missing values in 'agent' and 'company' as they take meanings

In [163]:
# Add new feature 'is_comapny' 
hotel_data_raw['is_company'] = (hotel_data_raw['company'].notnull())

# Add new feature 'is_agent'
hotel_data_raw['is_agent'] = (hotel_data_raw['agent'].notnull())

# Add new feature 'is_diff_room_type'
hotel_data_raw['is_diff_room_type'] = (hotel_data_raw['reserved_room_type'] != hotel_data_raw['assigned_room_type'])

# Add new feature 'arrival_date_day_of_the_week'
hotel_data_raw_dates = hotel_data_raw['arrival_date_year'] + ' ' + hotel_data_raw['arrival_date_month'] + ' ' + hotel_data_raw['arrival_date_day_of_month']
hotel_data_raw['arrival_date_day_of_the_week'] = [datetime.datetime.strptime(date, '%Y %B %d').strftime('%A') for date in hotel_data_raw_dates]

In [164]:
print('Progress check:')
hotel_data_raw.info()

Progress check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
hotel                             119390 non-null object
is_canceled                       119390 non-null object
lead_time                         119390 non-null float64
arrival_date_year                 119390 non-null object
arrival_date_month                119390 non-null object
arrival_date_week_number          119390 non-null object
arrival_date_day_of_month         119390 non-null object
stays_in_weekend_nights           119390 non-null float64
stays_in_week_nights              119390 non-null float64
adults                            119390 non-null float64
children                          119390 non-null float64
babies                            119390 non-null float64
meal                              119390 non-null object
country                           119390 non-null object
market_segment                    119390 non-null object
distribution

Index(['hotel', 'is_canceled', 'arrival_date_year', 'arrival_date_month',
       'arrival_date_week_number', 'arrival_date_day_of_month', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'reserved_room_type', 'assigned_room_type',
       'deposit_type', 'customer_type', 'reservation_status',
       'reservation_status_date', 'arrival_date_day_of_the_week'],
      dtype='object')

In [165]:
print('Here are the srting features.')
print(list(hotel_data_raw.select_dtypes([np.object]).columns))
print('We will one-hot encode some categorical features that are useful and leave those unuseful or boolean ones alone.')

# Get dummy variables 
to_get_dummies = ['hotel','arrival_date_month','meal','market_segment','distribution_channel','customer_type','deposit_type']
hotel_data_raw = pd.get_dummies(hotel_data_raw, columns=to_get_dummies)

Here are the srting features.
['hotel', 'is_canceled', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'reservation_status_date', 'arrival_date_day_of_the_week']
We will one-hot encode some categorical features that are useful and leave those unuseful or boolean ones alone.


In [166]:
# DO NOT execute this.
#le = preprocessing.LabelEncoder()
#to_be_encoded = ['hotel', 'arrival_date_month','meal','market_segment', 'distribution_channel','deposit_type', 'customer_type']
#for feature in to_be_encoded:
    #hotel_data_raw['{}_encoded'.format(feature)] = le.fit_transform(hotel_data_raw[feature])

In [167]:
print('Progress check:')
hotel_data_raw.info()
hotel_data_raw.head()

Progress check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 68 columns):
is_canceled                       119390 non-null object
lead_time                         119390 non-null float64
arrival_date_year                 119390 non-null object
arrival_date_week_number          119390 non-null object
arrival_date_day_of_month         119390 non-null object
stays_in_weekend_nights           119390 non-null float64
stays_in_week_nights              119390 non-null float64
adults                            119390 non-null float64
children                          119390 non-null float64
babies                            119390 non-null float64
country                           119390 non-null object
is_repeated_guest                 119390 non-null object
previous_cancellations            119390 non-null float64
previous_bookings_not_canceled    119390 non-null float64
reserved_room_type                119390 non-null object
assigned_r

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,is_company,is_agent,is_diff_room_type,arrival_date_day_of_the_week,hotel_City Hotel,hotel_Resort Hotel,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable
0,0,0.464043,2015,27,1,0.0,0.0,0.036364,0.0,0.0,PRT,0,0.0,0.0,C,C,0.142857,,,0.0,0.00118,0.0,0.0,Check-Out,2015-07-01,False,False,False,Wednesday,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0
1,0,1.0,2015,27,1,0.0,0.0,0.036364,0.0,0.0,PRT,0,0.0,0.0,C,C,0.190476,,,0.0,0.00118,0.0,0.0,Check-Out,2015-07-01,False,False,False,Wednesday,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0
2,0,0.009498,2015,27,1,0.0,0.02,0.018182,0.0,0.0,GBR,0,0.0,0.0,A,C,0.0,,,0.0,0.015053,0.0,0.0,Check-Out,2015-07-02,False,False,True,Wednesday,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0
3,0,0.017639,2015,27,1,0.0,0.02,0.018182,0.0,0.0,GBR,0,0.0,0.0,A,A,0.0,0.567416,,0.0,0.015053,0.0,0.0,Check-Out,2015-07-02,False,True,False,Wednesday,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
4,0,0.018996,2015,27,1,0.0,0.04,0.036364,0.0,0.0,GBR,0,0.0,0.0,A,A,0.0,0.447566,,0.0,0.019307,0.0,0.2,Check-Out,2015-07-03,False,True,False,Wednesday,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0


In [168]:
# Add new feature 'continent' with the information provided in 'country'

continents = []

continent_name_code = {
    'NA': 'North America',
    'SA': 'South America', 
    'AS': 'Asia',
    'OC': 'Australia',
    'AF': 'Africa',
    'AQ': 'Antarctica',
    'EU': 'Europe',
    
}

for country in hotel_data_raw["country"]:
    try:
        if country == 'CN':
            country_code == 'CN'
        elif country == 'TMP':
            country_code == 'TL'
        else:
            country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")

    except:
        print(country,'is not valid')
    
    try:
        #Antarctica
        if country_code == 'AQ':
            continent = 'AQ'
        #United States Minor Outlying Islands
        elif country_code == 'UM':
            continent = 'NA'
        #French Southern and Antarctic Lands
        elif country_code == 'TF':
            continent = 'EU'
        else:
            continent = pc.country_alpha2_to_continent_code(country_code)
    except:
        print(country_code,'is not valid for continent')

        
    continents.append(continent_name_code[continent])
    

hotel_data_raw['continent'] = continents



In [169]:
print('Progress check:')
hotel_data_raw.info()
hotel_data_raw[['country','continent']].head(5)

Progress check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 69 columns):
is_canceled                       119390 non-null object
lead_time                         119390 non-null float64
arrival_date_year                 119390 non-null object
arrival_date_week_number          119390 non-null object
arrival_date_day_of_month         119390 non-null object
stays_in_weekend_nights           119390 non-null float64
stays_in_week_nights              119390 non-null float64
adults                            119390 non-null float64
children                          119390 non-null float64
babies                            119390 non-null float64
country                           119390 non-null object
is_repeated_guest                 119390 non-null object
previous_cancellations            119390 non-null float64
previous_bookings_not_canceled    119390 non-null float64
reserved_room_type                119390 non-null object
assigned_r

Unnamed: 0,country,continent
0,PRT,Europe
1,PRT,Europe
2,GBR,Europe
3,GBR,Europe
4,GBR,Europe


In [170]:
#hotel_data_raw['continent_encoded'] = le.fit_transform(hotel_data_raw['continent'])
# Get dummy variables for 'continent'
hotel_data_raw = pd.get_dummies(hotel_data_raw, columns=['continent'])

In [171]:
print('Progress check:')
hotel_data_raw.info()

Progress check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 75 columns):
is_canceled                       119390 non-null object
lead_time                         119390 non-null float64
arrival_date_year                 119390 non-null object
arrival_date_week_number          119390 non-null object
arrival_date_day_of_month         119390 non-null object
stays_in_weekend_nights           119390 non-null float64
stays_in_week_nights              119390 non-null float64
adults                            119390 non-null float64
children                          119390 non-null float64
babies                            119390 non-null float64
country                           119390 non-null object
is_repeated_guest                 119390 non-null object
previous_cancellations            119390 non-null float64
previous_bookings_not_canceled    119390 non-null float64
reserved_room_type                119390 non-null object
assigned_r

In [172]:
# Show correlation
hotel_data_raw.corr()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,is_company,is_agent,is_diff_room_type,hotel_City Hotel,hotel_Resort Hotel,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,arrival_date_month_October,arrival_date_month_September,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,continent_Africa,continent_Antarctica,continent_Asia,continent_Australia,continent_Europe,continent_North America,continent_South America
lead_time,1.0,0.085671,0.165799,0.119519,-0.037613,-0.020915,0.086042,-0.073548,0.000149,-0.069741,0.151464,0.170084,-0.063077,-0.116451,-0.095712,-0.12628,0.180031,-0.14222,0.075381,-0.075381,-0.027514,0.058121,-0.069918,-0.141459,-0.125571,0.10414,0.072029,-0.09481,0.042237,-0.063054,0.057043,0.094956,-0.038259,0.009714,0.136633,-0.097617,-0.015512,-0.041554,-0.067184,-0.165075,-0.174111,0.346275,0.146307,-0.186304,-0.003926,-0.134609,-0.161594,-0.031365,0.221734,-0.004906,0.068541,-0.031924,-0.173959,0.159548,-0.380233,0.380179,0.016587,-0.053859,-0.003199,-0.069917,0.012436,0.094008,-0.035932,-0.035104
stays_in_weekend_nights,0.085671,1.0,0.498969,0.091871,0.045794,0.018483,-0.012775,-0.042715,0.063281,0.140739,0.066749,-0.054151,0.049342,-0.018554,0.072671,-0.107542,0.126434,-0.076657,-0.186596,0.186596,0.006758,0.04825,-0.015371,-0.020077,-0.033927,0.06646,-0.002297,-0.020628,-0.019211,-0.035051,-0.010373,0.004494,-0.066769,0.017476,0.105243,-0.035044,0.022202,0.007379,-0.04567,-0.105547,-0.024573,-0.062016,0.063606,0.060176,-0.001753,-0.087007,-0.036528,-0.013576,0.084595,0.000469,0.102038,-0.007163,0.019485,-0.06492,0.113537,-0.114275,0.001761,0.056848,-0.003802,-0.010798,-0.0083,-0.012025,-0.010121,0.010175
stays_in_week_nights,0.165799,0.498969,1.0,0.092976,0.044203,0.020191,-0.013992,-0.048743,0.096209,0.182382,0.182211,-0.00202,0.065237,-0.024859,0.068192,-0.08137,0.117194,-0.091036,-0.23402,0.23402,-0.013773,0.063877,-0.018359,-0.045471,-0.036919,0.055093,0.025822,0.009454,-0.016199,-0.012392,-0.045361,0.003917,-0.062256,0.015285,0.121446,-0.063031,0.032146,0.000338,-0.049937,-0.095474,-0.027135,-0.069309,0.09254,0.041042,-0.003218,-0.088032,-0.026651,-0.023879,0.077869,-0.00034,0.133082,-0.016182,0.007299,-0.064281,0.07886,-0.079999,0.006791,0.066408,-0.001073,-0.038768,-0.012346,0.027793,-0.042787,-0.007358
adults,0.119519,0.091871,0.092976,1.0,0.03044,0.018146,-0.006738,-0.107983,-0.051673,-0.035594,0.207793,-0.008283,0.230641,0.014785,0.122884,-0.235137,0.186693,-0.062009,-0.013183,0.013183,0.010977,0.091751,-0.012771,-0.030384,-0.065403,0.077962,0.002296,-0.026695,-0.016956,-0.070182,-0.02178,0.008876,-0.03745,0.014479,0.04905,-0.003333,-0.0056,-0.065612,-0.05062,-0.230702,0.011442,-0.045833,-0.032371,0.162095,0.004548,-0.225235,-0.007627,-0.053372,0.14666,0.003839,0.020488,0.058976,0.091006,-0.11576,0.02848,-0.02898,0.003246,-0.025114,-0.006051,0.005158,0.006142,-0.018525,0.005498,0.037175
children,-0.037613,0.045794,0.044203,0.03044,1.0,0.02403,-0.024729,-0.021072,0.048952,0.04107,0.030931,-0.033271,0.324853,0.056255,0.081736,-0.05452,0.045706,-0.013072,-0.04421,0.04421,-0.000796,0.087758,0.002874,-0.004032,-0.01586,0.068712,-0.005785,-0.024313,-0.026763,-0.038718,-0.032948,-0.036617,0.037349,-0.001264,0.015747,-0.067787,-0.013754,-0.011625,-0.004326,-0.050743,0.064175,-0.112987,-0.097365,0.145759,-0.001067,-0.058046,0.050134,-0.010489,-0.007018,0.001561,-0.018105,-0.006649,0.096145,-0.092857,0.096965,-0.096668,-0.006754,0.028418,-0.001067,0.015972,-0.001412,-0.043801,0.022785,0.024757
babies,-0.020915,0.018483,0.020191,0.018146,0.02403,1.0,-0.007501,-0.00655,0.08344,0.036184,0.019206,-0.010621,0.029186,0.037383,0.097889,-0.013366,-0.006782,0.01857,-0.043434,0.043434,-0.010408,0.022985,0.008954,-0.001757,0.000345,0.005685,-0.004753,-0.006531,-0.006259,-0.003341,-0.007887,0.001054,-0.00898,0.018628,0.017661,-0.014372,0.005856,-0.003638,0.017592,-0.010893,0.052813,-0.033383,-0.003102,-0.003086,-0.000334,-0.011999,0.051253,-0.003283,-0.036216,-0.000528,-0.000189,0.000513,0.021595,-0.022925,0.030627,-0.030435,-0.003007,0.005362,-0.000334,-0.009345,-0.002738,0.00558,-0.005861,0.002961
previous_cancellations,0.086042,-0.012775,-0.013992,-0.006738,-0.024729,-0.007501,1.0,0.152728,-0.026993,-0.012488,-0.184574,0.005929,-0.065646,-0.018492,-0.048384,0.021229,-0.029454,-0.030138,-0.012292,0.012292,-0.016885,-0.008076,0.001729,-0.003789,0.000615,0.012018,-0.024654,-0.021228,-0.026974,-0.014083,0.025415,0.074321,-0.019445,0.143841,0.014594,-0.028358,-0.002603,-0.001704,0.01227,0.020561,-0.02793,0.098718,0.016284,-0.079738,-0.000422,0.054929,-0.026364,-0.004152,-0.00989,-0.000668,0.030806,-0.004186,-0.026525,0.015144,-0.14221,0.143314,-0.003803,-0.008539,-0.000422,-0.017186,-0.006785,0.029279,-0.014509,-0.013518
previous_bookings_not_canceled,-0.073548,-0.042715,-0.048743,-0.107983,-0.021072,-0.00655,0.152728,1.0,0.011608,0.023252,-0.208557,-0.009397,-0.072144,0.047653,0.037824,0.270518,-0.188226,0.043914,-0.004441,0.004441,-0.001797,-0.019695,0.007891,0.007151,0.025597,-0.002467,0.003942,0.009443,0.004021,0.003684,-0.012701,-0.012964,0.041908,-0.006069,-0.028523,-0.024257,-0.008479,0.008735,0.101223,0.276428,1.4e-05,-0.036085,-0.03841,-0.072887,-0.000375,0.257857,0.012485,-0.003406,-0.164427,-0.000593,-0.016689,0.021285,0.03054,-0.028603,0.031687,-0.031509,-0.002919,-0.006629,-0.000375,-0.014215,-0.005935,0.02484,-0.011536,-0.012894
booking_changes,0.000149,0.063281,0.096209,-0.051673,0.048952,0.08344,-0.026993,0.011608,1.0,0.06701,0.122098,-0.011634,0.019618,0.06562,0.052833,0.080742,-0.074787,0.093958,-0.07282,0.07282,0.002254,0.024257,0.005369,-0.012074,0.009336,-0.00182,-0.014059,0.000342,-0.001992,0.010403,-0.009384,-0.011402,-0.035107,0.010171,0.056152,-0.023379,0.022496,0.002768,0.013504,0.028451,0.091988,-0.006375,-0.046186,-0.028764,-0.001387,0.053343,0.101558,-0.007887,-0.117698,-0.002194,-0.029225,0.007853,-0.088456,0.105576,0.116419,-0.119333,0.020991,0.021844,-0.001387,0.005017,0.003597,-0.028062,0.023848,0.008699
agent,-0.069741,0.140739,0.182382,-0.035594,0.04107,0.036184,-0.012488,0.023252,0.06701,1.0,0.350746,-0.055151,-0.024695,0.177353,0.034162,0.048073,,0.086653,-0.790232,0.790232,0.009362,0.002031,0.008035,0.000678,0.028667,0.010583,0.011448,0.005513,0.006698,0.022304,-0.039114,-0.052554,0.011381,0.020257,0.137917,-0.20987,0.112688,0.009324,0.003769,0.088318,0.075904,-0.044312,0.00869,-0.027834,,0.121499,0.094634,0.041468,-0.140338,-0.00299,-0.053339,0.013783,0.00198,0.021477,0.104147,-0.105642,0.020059,-0.01933,-0.00299,-0.063499,-0.004231,0.069303,-0.018017,-0.027421


In [173]:
#hotel_data_raw.to_csv('Hotel_Bookings_Prepared_new.csv')

In [174]:
hotel_data_raw['is_canceled'].value_counts()

0    75166
1    44224
Name: is_canceled, dtype: int64