In [15]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, RandomizedSearchCV
import joblib
import datautil
from sklearn.metrics import classification_report
from pprint import pprint

In [4]:
full_data = pd.read_csv('../train.csv')
full_data = full_data[(full_data['adr'] < 1000) & (full_data['adr'] > -100)] # remove outliers

# get the preprocessor and the default training features
preprocessor, features_spec = datautil.get_the_data_preprocessor()

# split data into input and labeled
X_train_full_raw = full_data[features_spec]
y_train_full = np.array(full_data['is_canceled'])
X_transformed = preprocessor.fit_transform(X_train_full_raw)

In [7]:
# build the random forest classifier
rf_cls = RandomForestClassifier(random_state=42, n_jobs=-1)
kfolds = 5 # 5-cross validation
split = KFold(kfolds, shuffle=True, random_state=42)

cv_results = cross_val_score(rf_cls, X_transformed, y_train_full, 
                            cv=split, scoring="accuracy")
print("Mean accuracy is: " + str(np.mean(cv_results)))

Mean accuracy is: 0.8764477157641594


In [11]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf_cls.get_params())

# parameter meanings:
# n_estimators = number of trees in the foreset
# max_features = max number of features considered for splitting a node
# max_depth = max number of levels in each decision tree
# min_samples_split = min number of data points placed in a node before the node is split
# min_samples_leaf = min number of data points allowed in a leaf node
# bootstrap = method for sampling data points (with or without replacement)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [12]:
# Create random parameter grid

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
rf = RandomForestClassifier(verbose=1)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_transformed, y_train_full)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# tune model 
# rf_parameters = {"max_depth": [10,13],
#                  "n_estimators": [10,100,500],
#                  "min_samples_split": [2,5]}
# rf_model = RandomForestClassifier()
# rf_cv_model = GridSearchCV(rf_model,
#                            rf_parameters,
#                            cv = 10,
#                            n_jobs = -1,
#                            verbose = 2)

# rf_cv_model.fit(X_transformed, y_train_full)
# print('Best parameters: ' + str(rf_cv_model.best_params_))

In [None]:
rf_tuned = RandomForestClassifier(max_depth = 13,
                                  min_samples_split = 5,
                                  n_estimators = 500)

print('Model: Random Forest Tuned\n')
# get cross validation score for the model
cv_results = cross_val_score(rf_tuned, X_transformed, y_train_full, 
                            cv=split, scoring="accuracy")
min_score = round(min(cv_results), 4) # round to 4 decimal precision
max_score = round(max(cv_results), 4)
mean_score = round(np.mean(cv_results), 4)
std_dev = round(np.std(cv_results), 4)
print(f"cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")


# rf_cls.fit(X_transformed, y_train_full)

# print("RF", classification_report(y_test, predict_rf))
# joblib.dump(rf_cls, "rf_cls.model")