hotel booking demand
====================
#### https://www.kaggle.com/jessemostipak/hotel-booking-demand   
   
+ Find a problem
+ Analyze previous approaches
+ Modeling your methodolody
+ Analyzing data
+ Visualization
+ Collaborative Evaluation  
<br/>



</br>

-----------------

In [1]:
!pip install pycountry
!pip install catboost

Collecting pycountry
  Downloading pycountry-20.7.3.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 11.6 MB/s 
[?25hBuilding wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-20.7.3-py2.py3-none-any.whl size=10746883 sha256=912465482070139d20a7e623119f3ab192c3cc56bb5017f1449d04b875807fb6
  Stored in directory: /root/.cache/pip/wheels/57/e8/3f/120ccc1ff7541c108bc5d656e2a14c39da0d824653b62284c6
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3
Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


-----------
## Packages Import


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry as pc

pd.options.display.max_columns = None

In [3]:
## 데이터 불러오기
data = pd.read_csv('./hotel_bookings.csv')

In [4]:
## 데이터 확인
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


<br/>
</br>


--------------
## Data Pre-Processing

In [5]:
## 전처리를 위한 데이터 복사
df = data.copy()

In [6]:
## column의 null 수 확인
df.isnull().sum().sort_values(ascending=False)[:10]

company                     112593
agent                        16340
country                        488
children                         4
lead_time                        0
arrival_date_year                0
arrival_date_month               0
arrival_date_week_number         0
is_canceled                      0
market_segment                   0
dtype: int64

<br/>
</br>

##### 위 결과를 토대로 company, agent , country, children  column 에 결측치 처리가 필요

In [7]:
## agent, company column의 값이 null 이면 0으로 대체한다.
df[['agent','company']] = df[['agent','company']].fillna(0.0)

## country column의 값이 null 이면 가장 많이 나온 값으로 대체한다.
df['country'].fillna(data.country.mode().to_string(), inplace=True)

In [8]:
## children column 값이 null 이면 0으로 대체한다.
df['children']=df['children'].fillna(0.0)

## adult, baby , child 합이 0인 row를 drop한다.
df = df.drop(df[(df.adults+df.babies+df.children)==0].index)

In [9]:
## datatype을 int로 바꾼다
df[['children', 'company', 'agent']] = df[['children', 'company', 'agent']].astype('int64')

<br/>
</br>

##### 숙박 일수에 대한 전처리 필요
+ 총 숙박 일수 1박 미만, 무료로 호텔을 이용한 경우 : 이상치로 판단

In [10]:
## 총 숙박 일수가 1박 미만인 row를 제거한다.

## 총 숙박 일수 feature
df['total_staying_nights'] = df['stays_in_week_nights'] + df['stays_in_weekend_nights']

df = df[ df['total_staying_nights'] >= 1 ]
df = df.reset_index(drop=True)

In [11]:
## 값을 내고 호텔을 이용한 경우만 data로 사용한다.
df = df[ df['adr'] > 0 ].copy()

<br/>
</br>

### feature 생성

In [12]:
## 새로운 feature를 위해 데이터 복사
df_subset = df.copy()

In [13]:
## reservation_status column은 y값으로 쓸 is_canceled와 같은 값이니 제거
df_subset = df_subset.drop(['reservation_status'], axis=1)

## Room column 새로 추가.
## 예약된 방과 같으면 1, 아니면 0
df_subset['Room'] = 0
df_subset.loc[ df_subset['reserved_room_type'] == df_subset['assigned_room_type'] , 'Room'] = 1

## more canceled column 새로 추가.
## 예약 취소 비율이 높은 경우 1 , 아니면 0
df_subset['more_canceled'] = 0
df_subset.loc[ df_subset['previous_cancellations'] > df_subset['previous_bookings_not_canceled'] , 'not_canceled'] = 1

## reservation_status_date column 문자열 값을 year/month/date 값으로 분리
df_subset['reservation_status_date'] = pd.to_datetime(df_subset['reservation_status_date'])

df_subset['reservation_year'] = df_subset['reservation_status_date'].dt.year
df_subset['reservation_month'] = df_subset['reservation_status_date'].dt.month
df_subset['reservation_day'] = df_subset['reservation_status_date'].dt.day

## 남겨진 column 제거
df_subset = df_subset.drop(['booking_changes','assigned_room_type','reservation_status_date','distribution_channel'],axis=1)

<br/>
</br>

### Categorical variables을 숫자값으로 encoding

In [14]:
def transform(dataframe):

    ## LabelEncoder import
    from sklearn.preprocessing import LabelEncoder
    
    le = LabelEncoder()
    ## 모든 categorical var 지정.
    categorical_features = list(dataframe.columns[dataframe.dtypes == object])
    ## encoding 실행
    dataframe[categorical_features]=dataframe[categorical_features].apply(lambda x: le.fit_transform(x))
    return dataframe


## 모든 categorical var을 숫자형으로 바꾼다.
df_subset = transform(df_subset)

<br/>
</br>

### 정규화

In [15]:
##column의 분산값 확인
df_subset.var()

hotel                                 0.222716
is_canceled                           0.234348
lead_time                         11430.508914
arrival_date_year                     0.499289
arrival_date_month                   12.499659
arrival_date_week_number            184.281629
arrival_date_day_of_month            77.148962
stays_in_weekend_nights               0.989183
stays_in_week_nights                  3.572044
adults                                0.231569
children                              0.159791
babies                                0.009444
meal                                  1.138017
country                            2024.776043
market_segment                        1.520872
is_repeated_guest                     0.027086
previous_cancellations                0.720456
previous_bookings_not_canceled        2.091370
reserved_room_type                    2.864232
deposit_type                          0.113640
agent                             11500.539910
company      

<br/>
</br>

##### 위 분산 값을 토대로 정규화 할 column 선정

In [16]:
df_subset['lead_time']=np.log(df_subset['lead_time']+1)
df_subset['agent'] = np.log(df_subset['agent'] + 1)
df_subset['company'] = np.log(df_subset['company'] + 1)
df_subset['adr'] = np.log(df_subset['adr'] + 1)
df_subset['country'] = np.log(df_subset['country'] + 1)
df_subset['days_in_waiting_list'] = np.log(df_subset['days_in_waiting_list'] + 1)
df_subset['reservation_month'] = np.log(df_subset['reservation_month'] + 1)
df_subset['reservation_day'] = np.log(df_subset['reservation_day'] + 1)
df_subset['arrival_date_week_number'] = np.log(df_subset['arrival_date_week_number'] + 1)
df_subset['arrival_date_day_of_month'] = np.log(df_subset['arrival_date_day_of_month'] + 1)
df_subset['arrival_date_month'] = np.log(df_subset['arrival_date_month'] + 1)

df_subset = df_subset.drop([])

In [17]:
## 다시 확인
df_subset.var()

hotel                             0.222716
is_canceled                       0.234348
lead_time                         2.520964
arrival_date_year                 0.499289
arrival_date_month                0.575158
arrival_date_week_number          0.439545
arrival_date_day_of_month         0.506808
stays_in_weekend_nights           0.989183
stays_in_week_nights              3.572044
adults                            0.231569
children                          0.159791
babies                            0.009444
meal                              1.138017
country                           0.465148
market_segment                    1.520872
is_repeated_guest                 0.027086
previous_cancellations            0.720456
previous_bookings_not_canceled    2.091370
reserved_room_type                2.864232
deposit_type                      0.113640
agent                             3.496853
company                           1.311311
days_in_waiting_list              0.507638
customer_ty

<br/>
</br>


--------------
## Modeling

<br/>
</br>

### data split

In [18]:
def data_split(df, label):
    
    from sklearn.model_selection import train_test_split

    X = df.drop(label, axis=1)
    Y = df[label]

    x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size = 0.30)
    
    return x_train, x_test, y_train, y_test


## train, test 로 데이터 split
## y 값에 is_canceled
x_train, x_test, y_train, y_test = data_split(df_subset, 'is_canceled')

from sklearn.impute import SimpleImputer

## default, imputing 'mean' value
imputer = SimpleImputer() 
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

<br/>
</br>

### data train
+ decision tree
+ XGBoost
+ Gradient Boosting
+ AdaBoost
+ RandomForest
+ LogisticRegression
+ CatBoost


In [19]:
def train(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier
    from xgboost import XGBClassifier
    from sklearn.ensemble import GradientBoostingClassifier 
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from catboost import CatBoostClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from lightgbm import LGBMClassifier
    from sklearn.ensemble import HistGradientBoostingClassifier

    ## decision tree
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(x_train,y_train)


    ## XGBoost
    xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 500)
    xgb.fit(x_train, y_train)

    
    ## Gradient Boosting
    gb = GradientBoostingClassifier()
    gb.fit(x_train, y_train)

    ## AdaBoost
    ada = AdaBoostClassifier(base_estimator = clf)
    ada.fit(x_train, y_train)

    ## RandomForest
    rd_clf = RandomForestClassifier(n_estimators=300)
    rd_clf.fit(x_train, y_train)    

    ## LogisticRegression
    lr = LogisticRegression()
    lr.fit(x_train, y_train)

    ## CatBoost
    cat = CatBoostClassifier(iterations=300)
    cat.fit(x_train, y_train)

    ##LGBM
    lgbm = LGBMClassifier(learning_rate = 1)
    lgbm.fit(x_train, y_train)

    ##ExtraTreesClassifier
    etc = ExtraTreesClassifier()
    etc.fit(x_train, y_train)

    ##HistGradientBoostingClasifier
    hgbc = HistGradientBoostingClassifier(random_state=0)
    hgbc.fit(x_train,y_train)

    return clf, xgb,gb,ada,rd_clf,lr,cat,lgbm,etc,hgbc


clf,xgb,gb,ada,rd_clf,lr,cat,lgbm,etc,hgbc = train(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Learning rate set to 0.204182
0:	learn: 0.5522438	total: 82.5ms	remaining: 24.7s
1:	learn: 0.4714694	total: 107ms	remaining: 15.9s
2:	learn: 0.4291059	total: 134ms	remaining: 13.2s
3:	learn: 0.4031615	total: 160ms	remaining: 11.8s
4:	learn: 0.3823115	total: 184ms	remaining: 10.8s
5:	learn: 0.3593003	total: 211ms	remaining: 10.4s
6:	learn: 0.3295090	total: 238ms	remaining: 9.97s
7:	learn: 0.3195925	total: 263ms	remaining: 9.59s
8:	learn: 0.3051550	total: 287ms	remaining: 9.29s
9:	learn: 0.2967529	total: 312ms	remaining: 9.04s
10:	learn: 0.2884833	total: 334ms	remaining: 8.78s
11:	learn: 0.2701853	total: 360ms	remaining: 8.64s
12:	learn: 0.2485232	total: 385ms	remaining: 8.51s
13:	learn: 0.2459800	total: 409ms	remaining: 8.35s
14:	learn: 0.2418864	total: 439ms	remaining: 8.34s
15:	learn: 0.2312259	total: 463ms	remaining: 8.22s
16:	learn: 0.2158573	total: 488ms	remaining: 8.12s
17:	learn: 0.2007114	total: 512ms	remaining: 8.02s
18:	learn: 0.1872196	total: 537ms	remaining: 7.94s
19:	learn:

<br/>
</br>

### 정확도 확인

In [20]:
def Score(clf,x_train,y_train,x_test,y_test):
    train_score = clf.score(x_train,y_train)
    test_score = clf.score(x_test,y_test)

    print("========================================")
    print(f'Training Accuracy of our model is: {train_score}')
    print(f'Test Accuracy of our model is: {test_score}')
    print("========================================")
    
    
Score(clf,x_train,y_train,x_test,y_test)

Training Accuracy of our model is: 1.0
Test Accuracy of our model is: 0.9462237365133447


<br/>
</br>

##### 모델 별 비교

In [21]:
models = pd.DataFrame({
    'Model' : ['Decision Tree Classifier', 'Random Forest Classifier', 
               'XgBoost',  'Cat Boost', 'Gradient Boosting Classifier','Ada Boost Classifier', 'Linear regression',"LightGBMClassifier","ExtraTreesClassifier","HistGradientBoostingClassifier"],

    'Score' : [clf.score(x_test,y_test),rd_clf.score(x_test,y_test), xgb.score(x_test,y_test), cat.score(x_test,y_test) , gb.score(x_test,y_test) , ada.score(x_test,y_test) , lr.score(x_test,y_test),lgbm.score(x_test,y_test),etc.score(x_test,y_test),hgbc.score(x_test,y_test)]
})

models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
2,XgBoost,0.999205
3,Cat Boost,0.998921
9,HistGradientBoostingClassifier,0.987252
7,LightGBMClassifier,0.968569
1,Random Forest Classifier,0.956729
8,ExtraTreesClassifier,0.95653
5,Ada Boost Classifier,0.947047
0,Decision Tree Classifier,0.946224
4,Gradient Boosting Classifier,0.918768
6,Linear regression,0.791369
