In [1]:
!pip install pycountry
!pip install catboost

Collecting pycountry
  Downloading pycountry-20.7.3.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 9.4 MB/s 
[?25hBuilding wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-20.7.3-py2.py3-none-any.whl size=10746883 sha256=f04a349cff6641e7ee1c9ce7c405b7bc56c4253e97e234eb0de8edc0dda3f015
  Stored in directory: /root/.cache/pip/wheels/57/e8/3f/120ccc1ff7541c108bc5d656e2a14c39da0d824653b62284c6
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3
Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 56 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry as pc

pd.options.display.max_columns = None

In [3]:
## Importing Data
data = pd.read_csv('./data/hotel_bookings.csv')

In [4]:
## Show the first 5 rows of Data
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
data.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [6]:
## Copy the dataset
df = data.copy()

In [7]:
## Find the missing value, show the total null values for each column and sort it in descending order
df.isnull().sum().sort_values(ascending=False)[:10]

company                     112593
agent                        16340
country                        488
children                         4
lead_time                        0
arrival_date_year                0
arrival_date_month               0
arrival_date_week_number         0
is_canceled                      0
market_segment                   0
dtype: int64

### agent,company,country,children

In [8]:
## Drop Rows where there is no adult, baby and child
df = df.drop(df[(df.adults+df.babies+df.children)==0].index)


## If no id of agent or company is null, just replace it with 0
df[['agent','company']] = df[['agent','company']].fillna(0.0)


## For the missing values in the country column, replace it with mode (value that appears most often)
df['country'].fillna(data.country.mode().to_string(), inplace=True)


## for missing children value, replace it with rounded mean value
df['children']=df['children'].fillna(0.0)

In [9]:
## convert datatype of these columns from float to integer
df[['children', 'company', 'agent']] = df[['children', 'company', 'agent']].astype('int64')

### 숙박 일수 1박 미만

In [10]:
# 총 숙박 일수가 1박 미만인 row 제거

# 총 숙박 일수 feature 생성
df['total_number_of_staying_nights'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']

# 총 숙박 일수가 1박 이상인 항목만 선택
df = df[ df['total_number_of_staying_nights'] >= 1 ]
df = df.reset_index(drop=True)

In [11]:
# adr이 0 이하인 row 제거
df = df[ df['adr'] > 0 ].copy()

In [12]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,total_number_of_staying_nights
0,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,0,0,0,Transient,75.0,0,0,Check-Out,2015-07-02,1
1,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304,0,0,Transient,75.0,0,0,Check-Out,2015-07-02,1
2,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240,0,0,Transient,98.0,0,1,Check-Out,2015-07-03,2
3,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240,0,0,Transient,98.0,0,1,Check-Out,2015-07-03,2
4,Resort Hotel,0,0,2015,July,27,1,0,2,2,0,0,BB,PRT,Direct,Direct,0,0,0,C,C,0,No Deposit,0,0,0,Transient,107.0,0,0,Check-Out,2015-07-03,2


In [13]:
## Copy the dataframe
df_subset = df.copy()

In [14]:
## Remove reservation_status column
## because it tells us if booking was cancelled 
df_subset = df_subset.drop(['reservation_status'], axis=1)

## Make the new column which contain 1 if guest received the same room which was reserved otherwise 0
df_subset['Room'] = 0
df_subset.loc[ df_subset['reserved_room_type'] == df_subset['assigned_room_type'] , 'Room'] = 1


## Make the new column which contain 1 if the guest has cancelled more booking in the past
## than the number of booking he did not cancel, otherwise 0

df_subset['net_cancelled'] = 0
df_subset.loc[ df_subset['previous_cancellations'] > df_subset['previous_bookings_not_canceled'] , 'net_cancelled'] = 1

df_subset['reservation_status_date'] = pd.to_datetime(df_subset['reservation_status_date'])

df_subset['year'] = df_subset['reservation_status_date'].dt.year
df_subset['month'] = df_subset['reservation_status_date'].dt.month
df_subset['day'] = df_subset['reservation_status_date'].dt.day

df_subset = df_subset.drop(['booking_changes','assigned_room_type','reservation_status_date','distribution_channel'],axis=1)
# 'arrival_date_year','arrival_date_week_number','arrival_date_day_of_month','arrival_date_month'

In [15]:
def transform(dataframe):
    
    
    ## Import LabelEncoder from sklearn
    from sklearn.preprocessing import LabelEncoder
    
    le = LabelEncoder()
    
    
    ## Select all categorcial features
    categorical_features = list(dataframe.columns[dataframe.dtypes == object])
    
    
    ## Apply Label Encoding on all categorical features
    dataframe[categorical_features]=dataframe[categorical_features].apply(lambda x: le.fit_transform(x))
    return dataframe

df_subset = transform(df_subset)

In [16]:
df_subset.var()

hotel                                 0.222716
is_canceled                           0.234348
lead_time                         11430.508914
arrival_date_year                     0.499289
arrival_date_month                   12.499659
arrival_date_week_number            184.281629
arrival_date_day_of_month            77.148962
stays_in_weekend_nights               0.989183
stays_in_week_nights                  3.572044
adults                                0.231569
children                              0.159791
babies                                0.009444
meal                                  1.138017
country                            2024.776043
market_segment                        1.520872
is_repeated_guest                     0.027086
previous_cancellations                0.720456
previous_bookings_not_canceled        2.091370
reserved_room_type                    2.864232
deposit_type                          0.113640
agent                             11500.539910
company      

In [17]:
df_subset['lead_time']=np.log(df_subset['lead_time']+1)
df_subset['agent'] = np.log(df_subset['agent'] + 1)
df_subset['company'] = np.log(df_subset['company'] + 1)
df_subset['adr'] = np.log(df_subset['adr'] + 1)
df_subset['country'] = np.log(df_subset['country'] + 1)
df_subset['days_in_waiting_list'] = np.log(df_subset['days_in_waiting_list'] + 1)
df_subset['month'] = np.log(df_subset['month'] + 1)
df_subset['day'] = np.log(df_subset['day'] + 1)
df_subset['arrival_date_week_number'] = np.log(df_subset['arrival_date_week_number'] + 1)
df_subset['arrival_date_day_of_month'] = np.log(df_subset['arrival_date_day_of_month'] + 1)
df_subset['arrival_date_month'] = np.log(df_subset['arrival_date_month'] + 1)

df_subset = df_subset.drop([])

In [18]:
df_subset.var()

hotel                             0.222716
is_canceled                       0.234348
lead_time                         2.520964
arrival_date_year                 0.499289
arrival_date_month                0.575158
arrival_date_week_number          0.439545
arrival_date_day_of_month         0.506808
stays_in_weekend_nights           0.989183
stays_in_week_nights              3.572044
adults                            0.231569
children                          0.159791
babies                            0.009444
meal                              1.138017
country                           0.465148
market_segment                    1.520872
is_repeated_guest                 0.027086
previous_cancellations            0.720456
previous_bookings_not_canceled    2.091370
reserved_room_type                2.864232
deposit_type                      0.113640
agent                             3.496853
company                           1.311311
days_in_waiting_list              0.507638
customer_ty

In [19]:
df_subset['adr'] = df_subset['adr'].fillna(value = df_subset['adr'].mean())

In [20]:
df_subset.shape

(117399, 33)

In [21]:
def data_split(df, label):
    
    from sklearn.model_selection import train_test_split

    X = df.drop(label, axis=1)
    Y = df[label]

    x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size = 0.30)
    
    return x_train, x_test, y_train, y_test



x_train, x_test, y_train, y_test = data_split(df_subset, 'is_canceled')

from sklearn.impute import SimpleImputer

## default, imputing 'mean' value
imputer = SimpleImputer() 
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

In [22]:
def train(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier
    from xgboost import XGBClassifier
    from sklearn.ensemble import GradientBoostingClassifier 
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from catboost import CatBoostClassifier

    #85퍼
    clf = DecisionTreeClassifier(random_state=0)
    # clf.fit(x_train,y_train)


    #86퍼
    # xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 500)
    # xgb.fit(x_train, y_train)
    
    # gb = GradientBoostingClassifier()
    # gb.fit(x_train, y_train)

    #86퍼
    # ada = AdaBoostClassifier(base_estimator = clf)
    # ada.fit(x_train, y_train)

    #88 퍼
    # rd_clf = RandomForestClassifier(n_estimators=300)
    # rd_clf.fit(x_train, y_train)    

    #80퍼
    # lr = LogisticRegression()
    # lr.fit(x_train, y_train)

    cat = CatBoostClassifier(iterations=300)
    cat.fit(x_train, y_train)

    return cat


clf = train(x_train, y_train)

Learning rate set to 0.204182
0:	learn: 0.5479512	total: 87.4ms	remaining: 26.1s
1:	learn: 0.4775545	total: 117ms	remaining: 17.5s
2:	learn: 0.4321732	total: 148ms	remaining: 14.6s
3:	learn: 0.4068577	total: 174ms	remaining: 12.9s
4:	learn: 0.3840794	total: 209ms	remaining: 12.3s
5:	learn: 0.3551343	total: 235ms	remaining: 11.5s
6:	learn: 0.3408316	total: 274ms	remaining: 11.5s
7:	learn: 0.3148264	total: 302ms	remaining: 11s
8:	learn: 0.3076143	total: 336ms	remaining: 10.8s
9:	learn: 0.3007340	total: 364ms	remaining: 10.5s
10:	learn: 0.2842781	total: 391ms	remaining: 10.3s
11:	learn: 0.2722910	total: 422ms	remaining: 10.1s
12:	learn: 0.2600858	total: 453ms	remaining: 9.99s
13:	learn: 0.2515848	total: 480ms	remaining: 9.8s
14:	learn: 0.2314625	total: 508ms	remaining: 9.64s
15:	learn: 0.2206376	total: 533ms	remaining: 9.46s
16:	learn: 0.2053632	total: 557ms	remaining: 9.28s
17:	learn: 0.1961543	total: 585ms	remaining: 9.16s
18:	learn: 0.1907368	total: 609ms	remaining: 9.01s
19:	learn: 0.

In [23]:
def Score(clf,x_train,y_train,x_test,y_test):
    train_score = clf.score(x_train,y_train)
    test_score = clf.score(x_test,y_test)

    print("========================================")
    print(f'Training Accuracy of our model is: {train_score}')
    print(f'Test Accuracy of our model is: {test_score}')
    print("========================================")
    
    
Score(clf,x_train,y_train,x_test,y_test)

Training Accuracy of our model is: 0.999914820087857
Test Accuracy of our model is: 0.9990630323679728
