# 1. Data Inspectation

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

Acknowledgements

The data is originally from the article Hotel Booking Demand Datasets, written by Nuno Antonio, Ana Almeida, and Luis Nunes for Data in Brief, Volume 22, February 2019.https://www.sciencedirect.com/science/article/pii/S2352340918315191

The data was downloaded and cleaned by Thomas Mock and Antoine Bichat for #TidyTuesday during the week of February 11th, 2020.https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-11/readme.md

raw data: https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv

Task: cluster cancel and non- cancel  

Reference:
https://www.kaggle.com/jessemostipak/hotel-booking-demand/notebooks


In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [5]:
nulls = df.isnull().sum()
nulls[nulls > 0]
#currently only 4 columns has missing values

children         4
country        488
agent        16340
company     112593
dtype: int64

In [6]:
df = df.replace('Undefined', np.NaN)
# there are few coloumns hace Undefined value instead of NaN,
#replace 'Undefined' to NaN in the dataframe

In [7]:
#calculate the total missing values across the whole dataset
df.isnull().sum().sum()/(len(df.index)*31)
# only have 3.5% missing value, so need to randmomly replace with missing value

0.03528717215739147

# 2. Data Preprocessing for missing values

#### (1) Drop unhelpful columns & rows

In [8]:
percentage = df.isnull().sum()/ len(df)
percentage.sort_values(ascending=False).head()

company                 0.943069
agent                   0.136862
meal                    0.009791
country                 0.004087
distribution_channel    0.000042
dtype: float64

In [9]:
#company has 94.4% missing values, not helpful, drop the column
df.drop(['company'], axis=1, inplace=True)

In [10]:
# reservation_status_date contain a lot of variety 
df.drop(['reservation_status_date'], axis=1, inplace=True)  # objects & 926 varieties 
# reservation_status includes 'Canceled' feature 
#By keeping reservation_status in data,
# it is possible to achieve 100% accuracy rate because that feature is direct way to predict cancellations
# so, drop the reservation_status coumns
df.drop(['reservation_status'], axis=1, inplace=True)


In [11]:
df = df.dropna()

#### (2) inspect all the columns unqiue values for replacing the missing values

In [12]:
#transform column to binary value
df['hotel'] = df['hotel'].map({'Resort Hotel':0, 'City Hotel':1}).astype(int)

df['arrival_date_month'] = df['arrival_date_month'].map({'January':1, 'February': 2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7,
                                                            'August':8, 'September':9, 'October':10, 'November':11, 'December':12}).astype(int)

In [13]:
# Since country colomn has high varity data,so need to transfer to numerical data
# transfer to catergorical data first, then transfer to numeric data
df['country'] = df['country'].astype('category')
df['country'] = df['country'].cat.codes

In [14]:
#inspect data again
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102064 entries, 3 to 119389
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           102064 non-null  int32  
 1   is_canceled                     102064 non-null  int64  
 2   lead_time                       102064 non-null  int64  
 3   arrival_date_year               102064 non-null  int64  
 4   arrival_date_month              102064 non-null  int32  
 5   arrival_date_week_number        102064 non-null  int64  
 6   arrival_date_day_of_month       102064 non-null  int64  
 7   stays_in_weekend_nights         102064 non-null  int64  
 8   stays_in_week_nights            102064 non-null  int64  
 9   adults                          102064 non-null  int64  
 10  children                        102064 non-null  float64
 11  babies                          102064 non-null  int64  
 12  meal            

In [15]:
# create new colomns: 'is_family' , 'deposit_given', 'total_nights'
def family_check(df):
    if ((df['adults'] > 0) & (df['children'] > 0)):
        val = 1
    elif ((df['adults'] > 0) & (df['babies'] > 0)):
        val = 1
    else:
        val = 0
    return val

def deposit(df):
    if ((df['deposit_type'] == 'No Deposit') | (df['deposit_type'] == 'Refundable')):
        return 0
    else:
        return 1
    
def previous_cancellations_check(df):
    if df['previous_cancellations'] == 0:
        return 0
    else:
        return 1
    
def previous_bookings_not_canceled_check(df):
    if df['previous_bookings_not_canceled'] == 0:
        return 1
    else:
        return 0    
    
def booking_changed_check(df):
    if df['booking_changes'] == 0:
        return 0
    else:
        return 1    
    
def feature(df):
    # create new column 'is_family' base on 'adults', 'children', 'babies'
    df['is_family'] = df.apply(family_check, axis = 1)
    # create new column 'deposit_given' base on 'deposit_type'
    df['deposit_given'] = df.apply(deposit, axis=1)
    df['previous_cancelled'] = df.apply(previous_cancellations_check, axis=1)
    df['previous_bookings_not_canceled_check'] = df.apply(previous_bookings_not_canceled_check, axis=1)
    df['booking_changed'] = df.apply(booking_changed_check, axis=1)
    # create new column 'total_nights' base on 'stays_in_weekend_nights' and 'stays_in_week_nights'
    df['total_nights'] = df['stays_in_weekend_nights']+ df['stays_in_week_nights']
    df['booking_times'] = df['previous_cancellations'] + df['previous_bookings_not_canceled']
    return df

df = feature(df)

In [16]:
# since we create 'deposit_given' column from 'deposit_type', so can drop 'deposit_type'
df.drop(['deposit_type'], axis=1, inplace=True)
df.drop(['previous_cancellations'], axis=1, inplace=True)
df.drop(['previous_bookings_not_canceled'], axis=1, inplace=True)
df.drop(['booking_changes'], axis=1, inplace=True)

In [17]:
# check the correlation in the dataset with "is_canceled"
corr_matrix = df.corr()
corr_matrix['is_canceled'].sort_values(ascending=False)
# can see 'arrival_date_month','arrival_date_week_number','arrival_date_year',
#          'children','stays_in_week_nights','arrival_date_day_of_month','total_nights'
# have very low correlation(< 0.01%) with 'is_cancelled', whcih is not significant, 
# so it a good idea to drop these columns for further prediction

is_canceled                             1.000000
deposit_given                           0.460740
country                                 0.295191
previous_cancelled                      0.290006
lead_time                               0.278742
hotel                                   0.114751
booking_times                           0.077117
previous_bookings_not_canceled_check    0.074994
days_in_waiting_list                    0.056020
adults                                  0.036479
adr                                     0.023393
arrival_date_month                      0.008866
stays_in_week_nights                    0.006806
arrival_date_week_number                0.006469
children                                0.005147
arrival_date_year                       0.004027
total_nights                           -0.003762
arrival_date_day_of_month              -0.006036
is_family                              -0.012439
stays_in_weekend_nights                -0.022452
is_repeated_guest   

In [18]:
# drop the colomns with low correlation(< 0.01%) 

df.drop(['arrival_date_month','arrival_date_week_number',
          'stays_in_week_nights','children','arrival_date_year','total_nights',
        'arrival_date_day_of_month'], axis=1, inplace=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102064 entries, 3 to 119389
Data columns (total 25 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   hotel                                 102064 non-null  int32  
 1   is_canceled                           102064 non-null  int64  
 2   lead_time                             102064 non-null  int64  
 3   stays_in_weekend_nights               102064 non-null  int64  
 4   adults                                102064 non-null  int64  
 5   babies                                102064 non-null  int64  
 6   meal                                  102064 non-null  object 
 7   country                               102064 non-null  int16  
 8   market_segment                        102064 non-null  object 
 9   distribution_channel                  102064 non-null  object 
 10  is_repeated_guest                     102064 non-null  int64  
 11  

In [20]:
df = pd.get_dummies(data = df, columns = ['meal', 'market_segment', 'distribution_channel',
                                            'reserved_room_type', 'assigned_room_type', 'customer_type'])

In [21]:
#inspect data
df.shape
df.describe()

(102064, 56)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,adults,babies,country,is_repeated_guest,agent,days_in_waiting_list,...,assigned_room_type_E,assigned_room_type_F,assigned_room_type_G,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
count,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,...,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0,102064.0
mean,0.697415,0.391274,111.966384,0.976466,1.899779,0.007623,89.347439,0.013893,85.420031,2.567732,...,0.061442,0.03052,0.021153,0.005888,0.002665,0.002185,0.039661,0.004625,0.758661,0.197053
std,0.459379,0.488038,107.910821,1.002527,0.491642,0.096786,44.172725,0.117049,109.527488,18.676237,...,0.24014,0.172014,0.143896,0.07651,0.051555,0.046692,0.195163,0.067847,0.427897,0.397775
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,26.0,0.0,2.0,0.0,51.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1.0,0.0,79.0,1.0,2.0,0.0,80.0,0.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,1.0,169.0,2.0,2.0,0.0,134.0,0.0,229.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,629.0,16.0,26.0,10.0,173.0,1.0,535.0,391.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### (3). Data Preprocessing for ML

In [22]:
# feature set and targer set
X = df.drop('is_canceled', axis = 1)
y = df['is_canceled']

In [23]:
# split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

#### (4) Data scaling

##### Choosing scaling
##### use MixMax scaling in order to normalize the data set. The data in our data set are spread across a wide range of values, which might result in various features affecting the final result more than the other feature. MixMax scaling reduces this effect by re-scaling the data to a specificed range of values, in this case 0-1

In [24]:
#data scalering - StandardScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

MinMaxScaler()

# 3. Classification

####  Evaluation strategy: The training set score and test set scores are close, the highest test set score among all models is better

###  (1) KNN Classification

In [25]:
#GridsearchCV and cross validation searching for KNN hypterparameter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1, 11)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5,return_train_score = False,n_jobs = -1)# return_train_score = True,
knn_cv.fit(X_train, y_train)
knn_cv.best_params_

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])})

{'n_neighbors': 6}

In [26]:
import pandas as pd
knn_cv_result = pd.DataFrame(knn_cv.cv_results_)
knn_cv_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,45.729095,1.471985,65.71159,1.823796,1,{'n_neighbors': 1},0.819268,0.819987,0.822142,0.817232,0.825985,0.820923,0.002978,10
1,37.084408,10.87477,79.707524,4.220398,2,{'n_neighbors': 2},0.824233,0.823449,0.824951,0.821216,0.825723,0.823914,0.001546,9
2,21.103745,2.201023,86.993508,4.543116,3,{'n_neighbors': 3},0.825343,0.820575,0.825278,0.823503,0.828075,0.824555,0.002469,8
3,20.554067,3.774008,89.388009,3.808561,4,{'n_neighbors': 4},0.826519,0.826061,0.828739,0.827879,0.830165,0.827873,0.001491,4
4,19.620534,2.175962,96.387252,3.175892,5,{'n_neighbors': 5},0.8258,0.823187,0.82678,0.825462,0.830949,0.826436,0.002545,7


In [28]:
#k=6 is the best hyperparameter, applied this value in the model
clf = KNeighborsClassifier(n_neighbors=6,n_jobs = -1)
clf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(clf.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(clf.score(X_test, y_test)))
print("Best parameters: {}".format(knn_cv.best_params_))
print("Best cross-validation score: {:.4f}".format(knn_cv.best_score_))

KNeighborsClassifier(n_jobs=-1, n_neighbors=6)

Training set score: 0.873
Test set accuracy: 0.835
Best parameters: {'n_neighbors': 6}
Best cross-validation score: 0.8286


### (2) Logistic Regression

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":[0.001, 0.01, 0.1, 1, 10, 100, 1000], "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression(solver='liblinear')
logreg_cv=GridSearchCV(logreg,grid,cv=5,return_train_score = False ,n_jobs = -1)
logreg_cv.fit(X_train,y_train)
logreg_cv.best_params_

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']})

{'C': 10, 'penalty': 'l1'}

In [30]:
import pandas as pd
logreg_cv_result = pd.DataFrame(logreg_cv.cv_results_)
logreg_cv_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.535749,0.014225,0.034325,0.01229,0.001,l1,"{'C': 0.001, 'penalty': 'l1'}",0.728021,0.729066,0.732528,0.731269,0.72637,0.729451,0.002212,14
1,0.692567,0.061816,0.016976,0.007359,0.001,l2,"{'C': 0.001, 'penalty': 'l2'}",0.750033,0.753886,0.751927,0.753152,0.748841,0.751568,0.001887,13
2,5.65756,0.459834,0.011138,0.001304,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",0.787459,0.788178,0.781385,0.787119,0.788033,0.786435,0.002554,11
3,1.347759,0.136318,0.011211,0.001195,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",0.782299,0.781973,0.777792,0.78026,0.781305,0.780726,0.001624,12
4,105.054565,7.106201,0.012378,0.000835,0.1,l1,"{'C': 0.1, 'penalty': 'l1'}",0.792946,0.792423,0.787198,0.792214,0.79352,0.79166,0.002277,9


In [31]:
#use best parameter C value 
logreg1 = LogisticRegression(C=10, penalty = 'l1',solver='liblinear',random_state=0).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg1.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg1.score(X_test, y_test)))
print("Best parameters: {}".format(logreg_cv.best_params_))
print("Best cross-validation score: {:.4f}".format(logreg_cv.best_score_))

Training set score: 0.794
Test set score: 0.796
Best parameters: {'C': 10, 'penalty': 'l1'}
Best cross-validation score: 0.7935


### (3) Linear Support Vector Machine

In [32]:
#select best hyperparameter #slow
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
Cs = [0.01, 0.1, 1, 10, 100]
param_grid = {'C': Cs}
linearSVC = GridSearchCV(LinearSVC(max_iter=500000), param_grid, cv=5,return_train_score = False,n_jobs = -1)
linearSVC.fit(X_train, y_train)
linearSVC.best_params_

GridSearchCV(cv=5, estimator=LinearSVC(max_iter=500000), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100]})

{'C': 10}

In [33]:
import pandas as pd
linearSVC_result = pd.DataFrame(linearSVC.cv_results_)
linearSVC_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.88562,0.019952,0.013164,0.003052,0.01,{'C': 0.01},0.789745,0.79079,0.784847,0.788817,0.789405,0.788721,0.002041,5
1,1.731147,0.110965,0.018551,0.011994,0.1,{'C': 0.1},0.793076,0.79275,0.787133,0.790319,0.793194,0.791294,0.002331,4
2,11.966485,0.362919,0.011382,0.001602,1.0,{'C': 1},0.796277,0.794187,0.788047,0.792475,0.794239,0.793045,0.002774,3
3,124.208038,21.098415,0.012367,0.000798,10.0,{'C': 10},0.796081,0.794252,0.788439,0.792998,0.794173,0.793189,0.002571,1
4,709.050435,52.255466,0.008021,0.000873,100.0,{'C': 100},0.796081,0.794252,0.788243,0.793194,0.794108,0.793176,0.002638,2


In [34]:
#use best parameter C value 
svm = LinearSVC(C=10).fit(X_train, y_train)
print("Training set score: {:.3f}".format(svm.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svm.score(X_test, y_test)))
print("Best parameters: {}".format(linearSVC.best_params_))
print("Best cross-validation score: {:.4f}".format(linearSVC.best_score_))

Training set score: 0.794
Test set score: 0.795
Best parameters: {'C': 10}
Best cross-validation score: 0.7932




### (4) Kerenilzed Support Vector Machine (rbf, poly, and linear)

In [35]:
#linear hyperparameter selection 
from sklearn.svm import SVC
Cs = [0.01, 0.1, 1, 10, 100]
param_grid = {'C': Cs}
kerenl_lin = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5,return_train_score = False,n_jobs = -1)
kerenl_lin.fit(X_train, y_train)
print("The best classifier is: ", kerenl_lin.best_params_)

GridSearchCV(cv=5, estimator=SVC(kernel='linear'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100]})

The best classifier is:  {'C': 100}


In [36]:
import pandas as pd
kerenl_lin_result = pd.DataFrame(kerenl_lin.cv_results_)
kerenl_lin_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,699.136514,4.806039,92.877986,0.266528,0.01,{'C': 0.01},0.752449,0.758981,0.763423,0.760468,0.751715,0.757407,0.004583,5
1,687.927402,3.531994,87.801916,1.849335,0.1,{'C': 0.1},0.793468,0.793534,0.785957,0.792606,0.791169,0.791347,0.002827,4
2,722.851105,8.641951,86.146349,1.118319,1.0,{'C': 1},0.796799,0.797779,0.790202,0.795153,0.796198,0.795227,0.002653,3
3,926.858829,23.20916,83.706962,1.819546,10.0,{'C': 10},0.798563,0.798302,0.789941,0.79659,0.797047,0.796089,0.003162,2
4,2109.675299,249.765292,39.952739,17.808542,100.0,{'C': 100},0.798628,0.797779,0.789615,0.797505,0.797374,0.79618,0.003312,1


In [37]:
#use best parameter C value 
svc = SVC(kernel='linear', C=100,gamma='auto').fit(X_train, y_train)
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kerenl_lin.best_params_))
print("Best cross-validation score: {:.4f}".format(kerenl_lin.best_score_))

Training set score: 0.797
Test set score: 0.799
Best parameters: {'C': 100}
Best cross-validation score: 0.7962


In [38]:
#rbf hypterparameter selection 
C_range = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'C': C_range}
kernel_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5,return_train_score = False, n_jobs = -1)
kernel_rbf.fit(X_train, y_train)
print("The best classifier is: ", kernel_rbf.best_estimator_)

GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})

The best classifier is:  SVC(C=100)


In [39]:
import pandas as pd
kernel_rbf_result = pd.DataFrame(kernel_rbf.cv_results_)
kernel_rbf_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,807.156844,2.290566,115.831139,0.434314,0.001,{'C': 0.001},0.724037,0.725343,0.729262,0.727807,0.723169,0.725924,0.002288,6
1,587.144486,8.838613,87.616417,1.601845,0.01,{'C': 0.01},0.745983,0.747028,0.748792,0.747861,0.744333,0.746799,0.001542,5
2,527.745764,3.193816,78.666359,0.822243,0.1,{'C': 0.1},0.803788,0.802809,0.796669,0.8026,0.802992,0.801771,0.002583,4
3,491.562226,2.976957,69.599614,0.683921,1.0,{'C': 1},0.821097,0.817831,0.811104,0.81795,0.821543,0.817905,0.003733,3
4,528.628662,25.372365,62.389501,5.066527,10.0,{'C': 10},0.818158,0.820444,0.813651,0.822523,0.823568,0.819669,0.003534,2


In [40]:
#use best parameter C value 
svc = SVC(kernel='rbf', C=100,gamma='auto').fit(X_train, y_train)
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kernel_rbf.best_params_))
print("Best cross-validation score: {:.4f}".format(kernel_rbf.best_score_))

Training set score: 0.822
Test set score: 0.825
Best parameters: {'C': 100}
Best cross-validation score: 0.8232


In [25]:
#poly hyperparameter selection 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

Cs = [0.1, 1, 10, 100]
param_grid = {'C': Cs}
kernel_poly = GridSearchCV(SVC(kernel='poly'), param_grid, cv=3,return_train_score = False,n_jobs = -1)
kernel_poly.fit(X_train, y_train)
print("The best classifier is: ", kernel_poly.best_params_)

GridSearchCV(cv=3, estimator=SVC(kernel='poly'), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100]})

The best classifier is:  {'C': 100}


In [26]:
import pandas as pd
kernel_poly_result = pd.DataFrame(kernel_poly.cv_results_)
kernel_poly_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,708.982276,2.105757,214.644681,0.306265,0.1,{'C': 0.1},0.808434,0.805181,0.808316,0.80731,0.001506,4
1,689.117807,1.662509,196.391845,0.384779,1.0,{'C': 1},0.820113,0.815253,0.818428,0.817931,0.002015,3
2,673.007887,161.500016,150.461973,31.117068,10.0,{'C': 10},0.818898,0.819329,0.821014,0.819747,0.000913,2
3,753.345833,17.424017,78.119463,3.146025,100.0,{'C': 100},0.820622,0.822543,0.825169,0.822778,0.001863,1


In [27]:
#use best parameter C value 
svc = SVC(kernel = 'poly',C=100,gamma='auto').fit(X_train, y_train)
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kernel_poly.best_params_))
print("Best cross-validation score: {:.4f}".format(kernel_poly.best_score_))

Training set score: 0.815
Test set score: 0.817
Best parameters: {'C': 100}
Best cross-validation score: 0.8228


### (5) Decision Tree

In [28]:
#decision tree hyperparameter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}
clf_tree=DecisionTreeClassifier()
grid_search=GridSearchCV(clf_tree,parameters, cv=10,return_train_score = False,n_jobs = -1)
grid_search.fit(X_train, y_train)
print("The best classifier is: ", grid_search.best_params_)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)})

The best classifier is:  {'max_depth': 19, 'min_samples_split': 10}


In [29]:
import pandas as pd
grid_search_result = pd.DataFrame(grid_search.cv_results_)
grid_search_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.675826,0.079823,0.013544,0.015732,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.725016,0.731025,0.728543,0.729589,0.734814,0.730242,0.731809,0.730764,0.730598,0.722106,0.729451,0.00339,226
1,0.762661,0.020443,0.006786,0.000989,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.725016,0.731025,0.728543,0.729589,0.734814,0.730242,0.731809,0.730764,0.730598,0.722106,0.729451,0.00339,226
2,0.800345,0.024738,0.017325,0.020746,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.725016,0.731025,0.728543,0.729589,0.734814,0.730242,0.731809,0.730764,0.730598,0.722106,0.729451,0.00339,226
3,0.862335,0.045506,0.025116,0.022411,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.725016,0.731025,0.728543,0.729589,0.734814,0.730242,0.731809,0.730764,0.730598,0.722106,0.729451,0.00339,226
4,0.911642,0.049899,0.011957,0.012152,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.725016,0.731025,0.728543,0.729589,0.734814,0.730242,0.731809,0.730764,0.730598,0.722106,0.729451,0.00339,226


In [30]:
#use best parameters values 
clf_tree=DecisionTreeClassifier(max_depth=19,min_samples_split=10).fit(X_train, y_train)
print("Training clf_treeset score: {:.3f}".format(clf_tree.score(X_train, y_train)))
print("Test set score: {:.3f}".format(clf_tree.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

Training clf_treeset score: 0.888
Test set score: 0.849
Best parameters: {'max_depth': 19, 'min_samples_split': 10}
Best cross-validation score: 0.8478


### (6) Random Forest

In [31]:
# random forest hyperparameter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}
clf_treeR=RandomForestClassifier()
grid_searchR=GridSearchCV(clf_treeR,parameters, cv=10,return_train_score = False,n_jobs = -1)
grid_searchR.fit(X_train, y_train)
print("The best classifier is: ", grid_searchR.best_params_)

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)})

The best classifier is:  {'max_depth': 19, 'min_samples_split': 10}


In [32]:
import pandas as pd
grid_searchR_result = pd.DataFrame(grid_searchR.cv_results_)
grid_searchR_result.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,10.055062,0.273137,0.306936,0.006679,1,10,"{'max_depth': 1, 'min_samples_split': 10}",0.681123,0.707381,0.736512,0.68452,0.708687,0.744219,0.743566,0.707642,0.74092,0.732036,0.718661,0.02278,246
1,10.546108,0.285324,0.310499,0.015224,1,30,"{'max_depth': 1, 'min_samples_split': 30}",0.739909,0.743436,0.678772,0.737296,0.680601,0.705944,0.685434,0.736381,0.736478,0.728508,0.717276,0.025352,248
2,10.322576,0.314747,0.31383,0.010825,1,50,"{'max_depth': 1, 'min_samples_split': 50}",0.676813,0.736904,0.739778,0.706336,0.743828,0.743697,0.682822,0.73821,0.74301,0.729031,0.724043,0.024535,237
3,10.244458,0.265394,0.329,0.02935,1,70,"{'max_depth': 1, 'min_samples_split': 70}",0.70725,0.741215,0.738472,0.686871,0.74226,0.681515,0.738994,0.742782,0.743663,0.693494,0.721652,0.024785,244
4,10.250307,0.325307,0.315948,0.022562,1,90,"{'max_depth': 1, 'min_samples_split': 90}",0.696799,0.739255,0.683344,0.73808,0.745395,0.738733,0.712998,0.738602,0.744578,0.727071,0.726486,0.020503,234


In [33]:
#use best parameters values 
clf_treeR=RandomForestClassifier(max_depth=19,min_samples_split=10).fit(X_train, y_train)
print("Training clf_treeset score: {:.3f}".format(clf_treeR.score(X_train, y_train)))
print("Test set score: {:.3f}".format(clf_treeR.score(X_test, y_test)))
print("Best parameters: {}".format(grid_searchR.best_params_))
print("Best cross-validation score: {:.4f}".format(grid_searchR.best_score_))

Training clf_treeset score: 0.881
Test set score: 0.864
Best parameters: {'max_depth': 19, 'min_samples_split': 10}
Best cross-validation score: 0.8617


# 4. Find the best model

##### 1. Knn: train 0.873, test 0.835, Best cross-validation score 0.8286

##### 2. Logistic Regression:  train  0.794,  test 0.796, Best cross-validation score 0.7935

##### 3. Linear Support Vector Machine : train 0.794, test 0.795, Best cross-validation score: 0.7932

##### 4.Kerenilzed Support Vector Machine (rbf, poly, and linear):   
##### (1) Linear train 0.797, test 0.799, Best cross-validation score: 0.7962 
##### (2) Rbf train 0.822, test 0.825, Best cross-validation score: 0.8232
##### (3) Poly train 0.815 test 0.817, Best cross-validation score: 0.8228

#### 5. decision tree: train 0.888  test 0.849, Best cross-validation score: 0.8478

#### 6. random forest: train 0.881  test 0.864, Best cross-validation score: 0.8617

### Random forest model has the highest training and test score. In addition, it also has the highest Best cross-validation score-- 0.8617. Therefore, Random forest model is the best option

In [34]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = grid_searchR.predict(X_test)
print('accuracy_score: ', accuracy_score(y_test, y_pred))
print('roc_auc_score: ', roc_auc_score(y_test, grid_searchR.predict_proba(X_test)[:,1]))

accuracy_score:  0.865221821602132
roc_auc_score:  0.942403916925321
