In [10]:
import pandas as pd
import numpy as np
import os,sys
sys.path.append(os.path.realpath('..'))
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, RandomizedSearchCV
import joblib
import datautil
from sklearn.metrics import classification_report, confusion_matrix
import pickle as pkl
from joblib import dump, load
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [6]:
full_data = pd.read_csv('../train.csv')
full_data = full_data[(full_data['adr'] < 1000) & (full_data['adr'] > -100)] # remove outliers

# get the preprocessor and the default training features
preprocessor, features_spec = datautil.get_the_data_preprocessor()

# split data into input and labeled
X_train_full_raw = full_data[features_spec]
y = np.array(full_data['is_canceled'])
X = preprocessor.fit_transform(X_train_full_raw)

In [7]:
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=66)

In [7]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

In [None]:
# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

In [None]:
# randomized search
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(100, 500, num = 11)] + [None]
}

gsearch1 = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = param_grid, verbose=3, scoring='roc_auc', cv=3, n_iter=100, random_state=42, n_jobs=-1)
gsearch1.fit(X,y)
print('best params')
print (gsearch1.best_params_)
print('best score')
print (gsearch1.best_score_)

# save grid search 
dump(gsearch1.best_estimator_, 'randomforest.model')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [11]:
rfc = XGBClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - XGBoost: ", rfc_cv_score.mean())





[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.9s finished


=== Confusion Matrix ===
[[14692]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14692

    accuracy                           1.00     14692
   macro avg       1.00      1.00      1.00     14692
weighted avg       1.00      1.00      1.00     14692



=== All AUC Scores ===
[0.8776161  0.85318443 0.834735  ]


=== Mean AUC Score ===
Mean AUC Score - XGBoost:  0.8551785100446946


In [16]:
rfc = LGBMClassifier()
rfc.fit(X_train,y_train)

# predictions
rfc_predict = rfc.predict(X_test)

# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - LightGBM: ", rfc_cv_score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished


=== Confusion Matrix ===
[[14042   720]
 [ 2625  5494]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.84      0.95      0.89     14762
           1       0.88      0.68      0.77      8119

    accuracy                           0.85     22881
   macro avg       0.86      0.81      0.83     22881
weighted avg       0.86      0.85      0.85     22881



=== All AUC Scores ===
[0.88563735 0.86931078 0.84427889]


=== Mean AUC Score ===
Mean AUC Score - LightGBM:  0.8664090063696371


In [28]:
dump(rfc, 'lightgbm.model')

['lightgbm.model']

In [13]:
# lightgbm random search
rs_params = {'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
            }

# Initialize a RandomizedSearchCV object using 5-fold CV-
rs_cv = RandomizedSearchCV(estimator=LGBMClassifier(), param_distributions=rs_params, cv = 3, n_iter=200,verbose=3, n_jobs=-1)

# Train on training data-
rs_cv.fit(X_train,y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=200, n_jobs=-1,
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001DC1A373520>,
                                        'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001DC1A373A60>,
                                        'min_child_weight': [1e-05, 0.001, 0.01,
                                                             0.1, 1, 10.0,
                                                             100.0, 1000.0,
                                                             10000.0],
                                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001DC1A11F0D0>,
                                        'reg_alpha': [0, 0.1, 1, 2, 5, 7, 10,
                                                      50, 100],
                                        'reg_lambda': [0, 0.1, 1, 5, 10, 20,

In [15]:
rfc = rs_cv.best_estimator_

# predictions
rfc_predict = rfc.predict(X_test)

# score
rfc_cv_score = cross_val_score(rfc, X, y, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Best LightGBM: ", rfc_cv_score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


=== Confusion Matrix ===
[[13952   810]
 [ 2499  5620]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.85      0.95      0.89     14762
           1       0.87      0.69      0.77      8119

    accuracy                           0.86     22881
   macro avg       0.86      0.82      0.83     22881
weighted avg       0.86      0.86      0.85     22881



=== All AUC Scores ===
[0.88449815 0.86348194 0.8391431 ]


=== Mean AUC Score ===
Mean AUC Score - Best LightGBM:  0.8623743986304655


In [None]:
dump(rfc, 'lightgbm.model')