In [8]:
import os
from pprint import pprint

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline

import datautil

In [2]:
# Load the training set and keep only rows whose `adr` lies strictly between
# -100 and 1000; anything outside that window is treated as an outlier.
full_data = pd.read_csv('../train.csv').loc[
    lambda frame: (frame['adr'] > -100) & (frame['adr'] < 1000)
]

# The project helper returns the preprocessor together with the list of
# feature columns it is meant to be applied to.
preprocessor, features_spec = datautil.get_the_data_preprocessor()

# Separate the model inputs from the label column.
X_train_full_raw = full_data[features_spec]
y_train_full = full_data['is_canceled'].to_numpy(copy=True)

# NOTE(review): the preprocessor is fitted on every row before any
# cross-validation split is made. If it learns statistics from the data
# (scaling, imputation, encodings), the fold scores computed from
# X_transformed may be slightly optimistic -- consider wrapping the
# preprocessor and the model in a Pipeline. TODO confirm what datautil does.
X_transformed = preprocessor.fit_transform(X_train_full_raw)

In [5]:
# Baseline: an untuned random forest scored with shuffled 5-fold cross-validation.
# n_jobs is kept at 1 because n_jobs=-1 caused an error on the Linux workstation.
rf_cls = RandomForestClassifier(n_jobs=1, random_state=42)

kfolds = 5  # number of cross-validation folds
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

cv_results = cross_val_score(
    rf_cls,
    X_transformed,
    y_train_full,
    scoring="accuracy",
    cv=split,
)
print("Mean accuracy is:", np.mean(cv_results))

Mean accuracy is: 0.8764477157641594


In [6]:
# Show the hyperparameters the baseline forest is currently configured with.
current_params = rf_cls.get_params()
print('Parameters currently in use:\n')
pprint(current_params)

# Glossary of the hyperparameters most relevant for tuning:
#   n_estimators      - number of trees in the forest
#   max_features      - max number of features considered when splitting a node
#   max_depth         - max number of levels in each decision tree
#   min_samples_split - min number of data points a node must hold before it is split
#   min_samples_leaf  - min number of data points allowed in a leaf node
#   bootstrap         - whether data points are sampled with or without replacement

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [9]:
# Tune the forest with an exhaustive grid search over the number of trees and
# the minimum number of samples required to split a node (3 x 2 = 6 candidates).
rf_parameters = {
    "n_estimators": [100, 500, 1000],
    "min_samples_split": [2, 5],
}

# Seed the estimator: an unseeded forest is fitted with different randomness on
# every candidate and every run, so small score gaps between candidates could
# not be attributed to the hyperparameters and the search would not reproduce.
rf_model = RandomForestClassifier(random_state=42)

# With an integer `cv` and a classifier, scikit-learn uses stratified folds.
# The scoring metric is spelled out so it visibly matches the accuracy metric
# used by the other cross-validation cells in this notebook.
rf_cv_model = GridSearchCV(
    rf_model,
    rf_parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=1,
    verbose=2,
)

rf_cv_model.fit(X_transformed, y_train_full)
print('Best parameters: ' + str(rf_cv_model.best_params_))
print('Best mean CV accuracy: ' + str(round(rf_cv_model.best_score_, 4)))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] min_samples_split=2, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............ min_samples_split=2, n_estimators=100, total=  13.5s
[CV] min_samples_split=2, n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s remaining:    0.0s


[CV] ............ min_samples_split=2, n_estimators=100, total=  12.6s
[CV] min_samples_split=2, n_estimators=100 ...........................
[CV] ............ min_samples_split=2, n_estimators=100, total=  12.8s
[CV] min_samples_split=2, n_estimators=100 ...........................
[CV] ............ min_samples_split=2, n_estimators=100, total=  12.1s
[CV] min_samples_split=2, n_estimators=100 ...........................
[CV] ............ min_samples_split=2, n_estimators=100, total=  12.9s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV] ............ min_samples_split=2, n_estimators=500, total= 1.0min
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV] ............ min_samples_split=2, n_estimators=500, total= 1.1min
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV] ............ min_samples_split=2, n_estimators=500, total= 1.0min
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 34.1min finished


Best parameters: {'min_samples_split': 5, 'n_estimators': 1000}


In [None]:
# Randomized search over a wider hyperparameter space than the grid search.
#
# NOTE(review): `random_grid` was never defined anywhere in this notebook, so
# the cell failed with a NameError on a fresh kernel. The search space below is
# a reconstruction covering the hyperparameters described in the glossary
# comments earlier in the notebook -- confirm it against the grid that was
# originally intended. 'sqrt' and 'log2' are used for max_features because the
# 'auto' alias is not accepted by recent scikit-learn releases.
random_grid = {
    "n_estimators": [200, 400, 600, 800, 1000],
    "max_features": ["sqrt", "log2"],
    "max_depth": [10, 30, 50, 70, 90, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# Seeded so that sampled candidates are compared under the same randomness.
rf = RandomForestClassifier(verbose=1, random_state=42)

# 100 sampled candidates x 3 folds = 300 fits.
# NOTE(review): n_jobs=-1 is kept from the original cell, but the baseline cell
# reports that n_jobs=-1 errors on the Linux workstation -- lower it there.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_transformed, y_train_full)
print('Best parameters: ' + str(rf_random.best_params_))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [10]:
# Re-evaluate the forest with the hyperparameters selected by the grid search,
# using the same shuffled K-fold splitter (`split`) as the baseline so the two
# accuracy figures are directly comparable.
# Seeded so the reported score can be reproduced on a fresh run.
rf_tuned = RandomForestClassifier(
    min_samples_split=5,
    n_estimators=1000,
    random_state=42,
)

print('Model: Random Forest Tuned\n')
# get cross validation score for the model
cv_results = cross_val_score(rf_tuned, X_transformed, y_train_full,
                             cv=split, scoring="accuracy")
min_score = round(min(cv_results), 4)  # round to 4 decimal precision
max_score = round(max(cv_results), 4)
mean_score = round(np.mean(cv_results), 4)
std_dev = round(np.std(cv_results), 4)

# The summary statistics were computed but never displayed; print them so the
# cell actually produces the summary line recorded in its saved output.
print(
    f"cross validation accuracy score: {mean_score} +/- {std_dev} (std) "
    f"min: {min_score}, max: {max_score}"
)

# TODO: fit the chosen model on the full training set, report
# classification_report on a held-out test set, and persist it with
# joblib.dump. The previous commented-out draft referenced `y_test` and
# `predict_rf`, neither of which is defined in this notebook.

Model: Random Forest Tuned

cross validation accuracy score: 0.8751 +/- 0.0013 (std) min: 0.8736, max: 0.8775
