In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
# read the dataset
data = pd.read_csv("clean_hotel_bookings.csv", index_col='Unnamed: 0')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119390 entries, 0 to 119389
Data columns (total 55 columns):
hotel                                 119390 non-null int64
is_canceled                           119390 non-null int64
is_repeated_guest                     119390 non-null int64
reserved_room_type                    119390 non-null object
assigned_room_type                    119390 non-null object
country_is_FRA                        119390 non-null int64
country_is_DEU                        119390 non-null int64
country_is_other                      119390 non-null int64
reserved_assigned_room_type_match     119390 non-null int64
fall                                  119390 non-null int64
spring                                119390 non-null int64
summer                                119390 non-null int64
winter                                119390 non-null int64
distribution_channel_Corporate        119390 non-null int64
distribution_channel_Direct           119390 no

In [4]:
data['is_canceled'].value_counts()

0    75166
1    44224
Name: is_canceled, dtype: int64

Noted that if we use majority-class method, the accuracy will be about 0.62. Therefore, the implemented method should be better than this. 

In [5]:
features = ['hotel',
            'is_repeated_guest',
#             'reserved_room_type',
#             'assigned_room_type',
            'reserved_assigned_room_type_match',
            'country_is_FRA',
            'country_is_DEU',
            'country_is_other',
            'spring',
            'summer',
            'fall',
            'winter',
            'distribution_channel_Corporate',
            'distribution_channel_Direct',
            'distribution_channel_GDS',
            'distribution_channel_TA/TO',
            'distribution_channel_Undefined',
            'market_segment_Aviation',
            'market_segment_Complementary',
            'market_segment_Corporate',
            'market_segment_Direct',
            'market_segment_Groups',
            'market_segment_Offline TA/TO',
            'market_segment_Online TA',
            'market_segment_Undefined',
            'meal_BB',
            'meal_FB',
            'meal_HB',
            'meal_SC',
            'meal_Undefined',
            'deposit_type_No Deposit',
            'deposit_type_Non Refund',
            'customer_type_Contract',
            'customer_type_Group',
            'customer_type_Transient',
            'customer_type_Transient-Party',
            'agent_14',
            'agent_7',
            'agent_is_other',
            'log_lead_minmax',
#             'cut_previous_cancellations',
#             'cut_previous_bookings_not_canceled',
#             'cut_booking_changes',
            'stays_in_weekend_nights_minmax',
            'stays_in_week_nights_minmax',
            'stays_in_total_nights_minmax',
            'adults_minmax',
            'children_minmax',
            'babies_minmax',
            'days_in_waiting_list_minmax',
            'adr_minmax',
            'required_car_parking_spaces_minmax',
            'total_of_special_requests_minmax'
           ]

label = ['is_canceled']

In [6]:
len(features)

48

In [7]:
X = data[features]
Y = data[label]

In [11]:
model = DecisionTreeClassifier()
model.fit(X,Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
model.get_depth()

51

In [13]:
model.get_n_leaves()

18407

In [20]:
np.random.seed(0)
model = DecisionTreeClassifier()
score_cv = cross_val_score(model, X, Y, cv=10)

In [21]:
score_cv

array([0.70636516, 0.67035176, 0.63994975, 0.60896147, 0.56612782,
       0.63204624, 0.60370246, 0.57162004, 0.54908695, 0.60546155])

In [22]:
score_cv.mean()

0.6153673192299143

In [29]:
# to try different hyper-parameters
criterion = ['gini', 'entropy']
depths = [10,20,30,40,50]
max_features = np.arange(1,49,15)
num_leafs = [1000,5000,10000,15000,20000]


try_grid = [{'criterion': criterion,
             'max_depth': depths, 
             'max_features': max_features,
             'max_leaf_nodes':num_leafs}]

np.random.seed(0)
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X,Y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [10, 20, 30, 40, 50],
              

In [30]:
DTM.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 16,
 'max_leaf_nodes': 5000}

In [31]:
DTM.best_score_

0.767501465784404

In [32]:
criterion = ['entropy']
depths = [5,10,15,20]
max_features = np.arange(1,20,3)
num_leafs = [5000,6000,7000,8000,9000,10000]


try_grid = [{'criterion': criterion,
             'max_depth': depths, 
             'max_features': max_features,
             'max_leaf_nodes':num_leafs}]
np.random.seed(0)
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X,Y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'criterion': ['entropy'],
                          'max_depth': [5, 10, 15, 20],
                          '

In [33]:
DTM.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 10,
 'max_leaf_nodes': 6000}

In [34]:
DTM.best_score_

0.7668984002010218

In [8]:
criterion = ['entropy']
depths = np.arange(7,13,2)
max_features = np.arange(5,15,2)
num_leafs = [5000,5500,6000,6500,7000]


try_grid = [{'criterion': criterion,
             'max_depth': depths, 
             'max_features': max_features,
             'max_leaf_nodes':num_leafs}]
np.random.seed(0)
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X,Y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'criterion': ['entropy'],
                          'max_depth': array([ 7,  9, 11]),
                       

In [9]:
DTM.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 13,
 'max_leaf_nodes': 6000}

In [10]:
DTM.best_score_

0.7679788927045816

In [11]:
criterion = ['entropy']
depths = np.arange(6,9)
max_features = np.arange(12,15)
num_leafs = [5500,5600,5900,6000,6100,6400,6500]


try_grid = [{'criterion': criterion,
             'max_depth': depths, 
             'max_features': max_features,
             'max_leaf_nodes':num_leafs}]
np.random.seed(0)
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X,Y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'criterion': ['entropy'],
                          'max_depth': array([6, 7, 8]),
                          

In [12]:
DTM.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 13,
 'max_leaf_nodes': 5600}

In [13]:
DTM.best_score_

0.7837339810704415

In [17]:
criterion = ['entropy']
depths = [8]
max_features = [13]
num_leafs = np.arange(5500,5900,20)


try_grid = [{'criterion': criterion,
             'max_depth': depths, 
             'max_features': max_features,
             'max_leaf_nodes':num_leafs}]
np.random.seed(0)
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X,Y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'criterion': ['entropy'], 'max_depth': [8],
                          'max_features': [13],
                 

In [18]:
DTM.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 13,
 'max_leaf_nodes': 5780}

In [19]:
DTM.best_score_

0.7818577770332523