## Random Forest
This notebook fits a Random Forest model to the Scania Trucks Air Pressure System (APS) predictive maintenance dataset, obtained from [UCI's data repository](https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks). 

In the Logistic Regression & Naive Bayes notebooks, grid searches were used for hyperparameter tuning. However, fitting a Random Forest model requires significantly more computational resources, which makes an exhaustive grid search impractical. This notebook therefore uses Bayesian optimization (the Tree-structured Parzen Estimator from `hyperopt`) in lieu of grid search.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from hyperopt import fmin, tpe, hp
from sklearn.impute import SimpleImputer
from sklearn.metrics import auc, roc_curve, precision_recall_curve, make_scorer, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

sns.set()

In [2]:
# Load the APS training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv(r'./data/aps_failure_training_set_data_only.csv')

In [4]:
def calc_misclassification_cost(y, y_pred):
    """Return the total misclassification cost of a set of predictions.

    Each false positive is charged 10 and each false negative is charged 500.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The total cost, ``10 * FP + 500 * FN``.

    Notes
    -----
    The confusion matrix is unpacked into exactly four cells, so the call
    fails unless the matrix is 2x2 (a binary problem with both labels seen).
    """
    # Flattened 2x2 matrix order is TN, FP, FN, TP; only the two error cells are priced.
    _, false_pos, false_neg, _ = confusion_matrix(y, y_pred).ravel()
    return 10 * false_pos + 500 * false_neg


# Wrap the cost function as a scikit-learn scorer. Lower cost is better, so
# greater_is_better=False makes the scorer return the NEGATED cost; code that
# consumes the scores has to flip the sign to recover the actual cost.
# The needs_proba / needs_threshold flags are deliberately not passed: both
# were False (the default, i.e. score the output of predict()), they are
# deprecated, and scikit-learn releases that dropped them would forward them
# to the cost function as unexpected keyword arguments.
misclassification_cost = make_scorer(
    calc_misclassification_cost,
    greater_is_better=False,
)

In [5]:
# Split the training frame into the target column and the feature matrix.
y_train = df_train['class']
X_train = df_train.drop(columns='class')

In [8]:
# Smoke test: cross-validate one hand-picked Random Forest configuration
# before launching the hyperparameter search.
params_test = {
    'n_estimators': 150,
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 6,
    'max_features': 'sqrt',
    'bootstrap': True,
    'max_leaf_nodes': None,
}

# n_estimators comes from params_test only. Passing it a second time as an
# explicit keyword raises "TypeError: got multiple values for keyword argument".
clf = RandomForestClassifier(random_state=1, n_jobs=-1, **params_test)

steps = [
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('rf_clf', clf)
]

pipe = Pipeline(steps=steps)

# The scorer is built with greater_is_better=False, so these per-fold scores
# are negated costs.
scores = cross_val_score(pipe, cv=3, X=X_train, y=y_train, scoring=misclassification_cost, n_jobs=-1)

In [None]:
# Draft of the full search space for the Bayesian optimisation.
# Every entry is a real hyperopt expression with a unique label; bare
# (function, args) tuples carry no label and are not sampled by fmin.
# qloguniform takes its bounds in log space, so np.log(2)..np.log(10000) with
# q=2 yields even values between 2 and 10000.
# NOTE(review): the 2..10000 range is read off the original draft's raw
# numbers - confirm it is the intended range.
# quniform / qloguniform return floats; cast to int before handing the values
# to RandomForestClassifier (a float min_samples_* is read as a fraction).
space = {
    'fill_na': hp.choice('fill_na', [0, -1, -100, -10_000, -1_000_000, 'mean', 'most_frequent']),
    'sampling': hp.choice('sampling', [None, 'smote']),
    'class_weight': hp.choice('class_weight', ['balanced', 'balanced_subsample']),
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    # Conditional space: either unlimited depth or an even depth in [2, 1000].
    'max_depth': hp.choice('max_depth', [None, hp.quniform('max_depth_int', 2, 1000, 2)]),
    'min_samples_split': hp.qloguniform('min_samples_split', np.log(2), np.log(10000), 2),
    'min_samples_leaf': hp.qloguniform('min_samples_leaf', np.log(2), np.log(10000), 2),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'max_leaf_nodes': hp.qloguniform('max_leaf_nodes', np.log(2), np.log(10000), 2)
}

In [65]:
def write_results_file(cv_mean_cost, params):
    """Append one evaluation result to the optimisation log file.

    Parameters
    ----------
    cv_mean_cost : number
        Cost of the evaluated configuration; written with ``%i``, i.e. as an
        integer (any fractional part is dropped).
    params : object
        The evaluated configuration; written using its ``str()`` form.

    Returns
    -------
    None

    Notes
    -----
    The file is opened in append mode at ``./logs/log_rf_bayes_opt.txt``,
    relative to the current working directory; the ``logs`` directory must
    already exist.
    """
    log_path = r'./logs/log_rf_bayes_opt.txt'
    with open(log_path, mode='a') as log_file:
        log_file.write('Cost = %i, Params = %s\n' % (cv_mean_cost, params))


def objective(params):
    """Return the cross-validated misclassification cost of one configuration.

    Builds an impute -> (optional SMOTE) -> Random Forest pipeline from the
    sampled parameters, scores it with 5-fold cross-validation on
    ``X_train`` / ``y_train``, appends the result to the log file and returns
    the mean cost.

    Parameters
    ----------
    params : dict
        Sampled configuration. Keys read here:

        * ``'fill_na'`` - a number (constant fill value) or a
          ``SimpleImputer`` strategy name.
        * ``'sampling'`` - ``'smote'`` to oversample, anything else for none.
        * ``'class_weight'`` - forwarded to ``RandomForestClassifier``.
        * ``'max_depth'`` - ``None`` or a number (truncated to int).
        * ``'max_leaf_nodes'`` - a number (truncated to int, minimum 2).

    Returns
    -------
    float
        Mean misclassification cost over the folds (lower is better).
    """
    print(params)

    # Translate the sampled values into valid RandomForestClassifier kwargs.
    max_depth = params['max_depth']
    params_rf = {
        'class_weight': params['class_weight'],
        'max_depth': None if max_depth is None else int(max_depth),
        # A log-uniform draw can fall below 2 and truncate to 1, which
        # scikit-learn rejects (max_leaf_nodes must be None or > 1); clamp it
        # so a single unlucky draw cannot abort the whole search.
        'max_leaf_nodes': max(2, int(params['max_leaf_nodes'])),
    }

    print(params_rf)

    # Impute: numeric values are constant fills, strings are strategy names.
    fill_na = params['fill_na']
    if isinstance(fill_na, (int, float, np.integer, np.floating)):
        imputer = SimpleImputer(strategy='constant', fill_value=fill_na)
    else:
        imputer = SimpleImputer(strategy=fill_na)
    steps = [('impute', imputer)]

    # SMOTE is a resampler, so it needs imblearn's Pipeline; the plain
    # scikit-learn Pipeline is kept when no resampling is requested.
    use_smote = params['sampling'] == 'smote'
    if use_smote:
        steps.append(('smote', SMOTE(random_state=1)))
    steps.append(('rf_clf', RandomForestClassifier(random_state=1, n_estimators=150, n_jobs=-1, **params_rf)))
    pipe = imbPipeline(steps=steps) if use_smote else Pipeline(steps=steps)

    scores = cross_val_score(pipe, cv=5, X=X_train, y=y_train, scoring=misclassification_cost, n_jobs=-1)
    # The scorer returns negated costs; flip the sign to get the real cost.
    cv_mean_cost = -scores.mean()

    write_results_file(cv_mean_cost, params)

    return cv_mean_cost

In [66]:
# Test run: a reduced search space and only 10 evaluations, to check that the
# objective runs end to end. This rebinds `space`.
# loguniform takes its bounds in log space, so max_leaf_nodes is drawn from
# exp(0.1) to exp(10), not from 0.1 to 10.
space = dict(
    fill_na=hp.choice('fill_na', [0, 'mean', 'most_frequent']),
    sampling=hp.choice('sampling', [None, 'smote']),
    class_weight=hp.choice('class_weight', ['balanced', 'balanced_subsample']),
    max_depth=hp.choice('max_depth', [None, hp.quniform('max_depth_int', 2, 50, 2)]),
    max_leaf_nodes=hp.loguniform('max_leaf_nodes', 0.1, 10),
)

# Fixed rstate so the sequence of sampled configurations is repeatable.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=10,
    rstate=np.random.default_rng(1),
)

{'class_weight': 'balanced', 'fill_na': 'most_frequent', 'max_depth': 28.0, 'max_leaf_nodes': 34.066296707256896, 'sampling': None}
{'class_weight': 'balanced', 'max_depth': 28, 'max_leaf_nodes': 34}
{'class_weight': 'balanced', 'fill_na': 0, 'max_depth': 38.0, 'max_leaf_nodes': 1014.4419763257858, 'sampling': None}
{'class_weight': 'balanced', 'max_depth': 38, 'max_leaf_nodes': 1014}
{'class_weight': 'balanced_subsample', 'fill_na': 'most_frequent', 'max_depth': 44.0, 'max_leaf_nodes': 6417.028098616062, 'sampling': None}
{'class_weight': 'balanced_subsample', 'max_depth': 44, 'max_leaf_nodes': 6417}
{'class_weight': 'balanced_subsample', 'fill_na': 'mean', 'max_depth': 14.0, 'max_leaf_nodes': 3078.838118799532, 'sampling': 'smote'}
{'class_weight': 'balanced_subsample', 'max_depth': 14, 'max_leaf_nodes': 3078}
{'class_weight': 'balanced_subsample', 'fill_na': 0, 'max_depth': None, 'max_leaf_nodes': 407.5380221189031, 'sampling': 'smote'}
{'class_weight': 'balanced_subsample', 'max_de

In [67]:
# fmin reports hp.choice parameters as the INDEX of the chosen option rather
# than the option itself (e.g. 'class_weight': 1 below), and nested labels such
# as 'max_depth_int' appear as separate keys. hyperopt.space_eval(space, best)
# decodes the result back into actual parameter values.
best

{'class_weight': 1,
 'fill_na': 0,
 'max_depth': 1,
 'max_depth_int': 20.0,
 'max_leaf_nodes': 75.56350722991468,
 'sampling': 1}