In [21]:
import pandas as pd
import numpy as np
import os,sys
sys.path.append(os.path.realpath('..'))
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, RandomizedSearchCV
import joblib
import datautil
from sklearn.metrics import classification_report, confusion_matrix
import pickle as pkl
from joblib import dump, load


In [12]:
full_data = pd.read_csv('../train.csv')
full_data = full_data[(full_data['adr'] < 1000) & (full_data['adr'] > -100)] # remove outliers

# get the preprocessor and the default training features
preprocessor, features_spec = datautil.get_the_data_preprocessor()

# split data into input and labeled
X_train_full_raw = full_data[features_spec]
y = np.array(full_data['is_canceled'])
X = preprocessor.fit_transform(X_train_full_raw)

In [13]:
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=66)

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

In [16]:
# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   29.1s finished


=== Confusion Matrix ===
[[13776   986]
 [ 1852  6267]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     14762
           1       0.86      0.77      0.82      8119

    accuracy                           0.88     22881
   macro avg       0.87      0.85      0.86     22881
weighted avg       0.88      0.88      0.87     22881



=== All AUC Scores ===
[0.8652969  0.85333683 0.81767558]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.845436434437895


In [20]:
# randomized search
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(100, 500, num = 11)] + [None]
}

gsearch1 = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = param_grid, verbose=3, scoring='roc_auc', cv=3, n_iter=100, random_state=42, n_jobs=-1)
gsearch1.fit(X,y)
print('best params')
print (gsearch1.best_params_)
print('best score')
print (gsearch1.best_score_)

# save grid search 
dump(gsearch1.best_estimator_, 'randomforest.model')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 59.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 142.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 152.5min finished


best params
{'n_estimators': 1600, 'max_features': 'sqrt', 'max_depth': 340}
best score
0.8488319845065413


NameError: name 'dump' is not defined

In [22]:
dump(gsearch1.best_estimator_, 'randomforest.model')

['randomforest.model']

In [26]:
from xgboost import XGBClassifier
rfc = XGBClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - XGBoost: ", rfc_cv_score.mean())





[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   12.5s finished


=== Confusion Matrix ===
[[13904   858]
 [ 2405  5714]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.85      0.94      0.89     14762
           1       0.87      0.70      0.78      8119

    accuracy                           0.86     22881
   macro avg       0.86      0.82      0.84     22881
weighted avg       0.86      0.86      0.85     22881



=== All AUC Scores ===
[0.8776161  0.85318443 0.834735  ]


=== Mean AUC Score ===
Mean AUC Score - XGBoost:  0.8551785100446946


In [27]:
from lightgbm import LGBMClassifier
rfc = LGBMClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - LightGBM: ", rfc_cv_score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.9s finished


=== Confusion Matrix ===
[[14042   720]
 [ 2625  5494]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.84      0.95      0.89     14762
           1       0.88      0.68      0.77      8119

    accuracy                           0.85     22881
   macro avg       0.86      0.81      0.83     22881
weighted avg       0.86      0.85      0.85     22881



=== All AUC Scores ===
[0.88563735 0.86931078 0.84427889]


=== Mean AUC Score ===
Mean AUC Score - LightGBM:  0.8664090063696371


In [28]:
dump(rfc, 'lightgbm.model')

['lightgbm.model']