In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import pickle

In [2]:
data = pd.read_csv('Final_data.csv')

In [3]:
inputs = data.drop('is_canceled', axis=1)
outputs = data['is_canceled']

### Split Data

In [4]:
x_train, x_test, y_train, y_test = train_test_split(inputs, outputs, test_size = 0.10, random_state=42)

### Model Selection

In [5]:
models = [('XGB', XGBClassifier()), ('DT', DecisionTreeClassifier()),
         ('LGBM', LGBMClassifier()), ('RF', RandomForestClassifier())]

In [6]:
kfolds = 5
kfold_split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

In [7]:
for name, model in models:
    model_steps = Pipeline(steps=[('tsvd', TruncatedSVD(n_components=15)),
                                  ('model', model)])
    cv_result = cross_val_score(model_steps, x_train,y_train, cv=kfold_split, scoring='accuracy')
    score = round(np.mean(cv_result), 4)
    print(f'{name} cross validation accuracy score is : {score}')

XGB cross validation accuracy score is : 0.8391
DT cross validation accuracy score is : 0.8188
LGBM cross validation accuracy score is : 0.8285
RF cross validation accuracy score is : 0.8473


# Modeling
Random forest potperform of all models.

### Hyperparameter Tuning

In [12]:
grid_param = {
        'model__bootstrap' : [True, False],
        'model__max_depth' : [10, 20, 30, 40, None],
        'model__min_samples_leaf' : [1, 2, 4],
        'model__min_samples_split' : [2, 5, 10],
        'model__n_estimators' : [100, 200, 500, 1000]
    }

In [13]:
model_pipe = Pipeline(steps=[('tsvd', TruncatedSVD(n_components=15)),
                              ('model', RandomForestClassifier())])
random_search = RandomizedSearchCV(model_pipe, grid_param, cv=kfold_split, verbose=10, n_jobs=3)
best_model = random_search.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  6.4min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  9.7min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed: 19.6min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed: 24.6min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 29.5min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 36.9min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed: 39.9min finished


In [14]:
best_model.best_estimator_

Pipeline(memory=None,
         steps=[('tsvd',
                 TruncatedSVD(algorithm='randomized', n_components=15, n_iter=5,
                              random_state=None, tol=0.0)),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=20, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=10,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
             

In [15]:
best_model.best_params_

{'model__n_estimators': 100,
 'model__min_samples_split': 10,
 'model__min_samples_leaf': 1,
 'model__max_depth': 20,
 'model__bootstrap': True}

In [16]:
best_model.best_score_

0.8519604408082866

### Final Model

In [17]:
final_model = Pipeline(steps=[('tsvd', TruncatedSVD(n_components=15)),
                              ('model', RandomForestClassifier(n_estimators=100, 
                                                               min_samples_split=10, 
                                                              min_samples_leaf=1,
                                                              max_depth=20,
                                                              bootstrap=True))])

In [18]:
final_model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tsvd',
                 TruncatedSVD(algorithm='randomized', n_components=15, n_iter=5,
                              random_state=None, tol=0.0)),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=20, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=10,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
             

### Predictions

In [20]:
y_pred = final_model.predict(x_test)

In [22]:
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred))

Accuracy on Test Data: 0.8591171789932155


### Saving pickel files

In [24]:
# saving model
pickle.dump(final_model, open('booking_hotel_model.pkl', 'wb'))