## Random Forest
This notebook fits a Random Forest model to the Scania Trucks Air Pressure System (APS) predictive maintenance dataset, obtained from [UCI's data repository](https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks). 

In the Logistic Regression & Naive Bayes notebooks, grid searches were used for hyperparameter tuning. However, significantly more computational resources are required to fit a Random Forest model. Thus, this notebook will use Bayesian Optimization in lieu of grid search.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from hyperopt import fmin, tpe, hp, Trials
from collections import defaultdict
from functools import partial

from sklearn.impute import SimpleImputer
from sklearn.metrics import auc, roc_curve, precision_recall_curve, make_scorer, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

sns.set()

### Load Data

In [2]:
df_train = pd.read_csv(r'./data/aps_failure_training_set_data_only.csv')

In [3]:
# df --> X_train & y_train
X_train = df_train.drop('class', axis=1)
y_train = df_train['class']

### Bayesian Optimization

In [4]:
def calc_misclassification_cost(y, y_pred):
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    return 10*fp + 500*fn


misclassification_cost = make_scorer(
    calc_misclassification_cost,
    greater_is_better=False,
    needs_proba=False,
    needs_threshold=False
)

In [5]:
class LogFile:
    def __init__(self, file_path):
        self.i = 1
        self.file_path = file_path
        pass

    def write_log_file(self, cv_mean_cost, params, params_skl):
        
        # Initialize
        dd = defaultdict(list)

        # Combine hyperopt params & skl params
        for d in (params, params_skl):
            for key, value in d.items():
                dd[key].append(value)

        # Create df & add iteration / cost
        df = pd.DataFrame(dd, index=['hyperopt', 'sklearn'])
        df['Iteration'] = self.i
        df['cv_mean_cost'] = cv_mean_cost

        # Reorder & append to log file
        df = df[['Iteration', 'cv_mean_cost', *dd.keys()]]

        # Write header
        if self.i == 1:
            df.to_csv(self.file_path, mode='w', header=True)
        else:
            df.to_csv(self.file_path, mode='a', header=False)

        # Increase iteration number
        self.i += 1

        pass

In [10]:
def create_sklearn_pipeline(params, fill_na, sampling):
    steps = []

    # Impute
    if type(fill_na) == int:
        steps.append(('impute', SimpleImputer(strategy='constant', fill_value=fill_na)))
    else:
        steps.append(('impute', SimpleImputer(strategy=fill_na)))

    # SMOTE
    if sampling == 'smote':
        steps.append(('smote', SMOTE(random_state=1)))
        steps.append(('rf_clf', RandomForestClassifier(random_state=1, n_estimators=200, n_jobs=-1, **params)))
        pipe = imbPipeline(steps=steps)

    else:
        steps.append(('rf_clf', RandomForestClassifier(random_state=1, n_estimators=200, n_jobs=-1, **params)))
        pipe = Pipeline(steps=steps)

    return pipe


def objective(params, LogFile):

    # Initialize
    params_rf = {
        'class_weight': params['class_weight'],
        'criterion': params['criterion'],
        'max_depth': None if params['max_depth'] == None else int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_split']),
        'max_features': params['max_features'],
        'max_leaf_nodes': None if params['max_leaf_nodes'] == None else int(params['max_leaf_nodes'])
    }

    # Create pipeline
    pipe = create_sklearn_pipeline(params_rf, params['fill_na'], params['sampling'])

    # Fit data & calculate CV score
    scores = cross_val_score(pipe, cv=5, X=X_train, y=y_train, scoring=misclassification_cost, n_jobs=-1)
    cv_mean_cost = -scores.mean()

    # Write params & results to log
    LogFile.write_log_file(cv_mean_cost, params, params_rf)

    return cv_mean_cost

In [11]:
%%time

space = {
    'fill_na': hp.choice('fill_na', [0, -1, -100, -10_000, -1_000_000, 'mean', 'most_frequent']),
    'sampling': hp.choice('sampling', [None, 'smote']),
    'class_weight': hp.choice('class_weight', ['balanced', 'balanced_subsample']),
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    'max_depth': hp.choice('max_depth', [None, hp.quniform('max_depth_int', 2, 1000, 2)]),
    'min_samples_split': hp.uniform('min_samples_split', 2, 500),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 2, 500),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', [None, hp.uniform('max_leaf_nodes_int', 100, 500)])
}

# Set up log file
log_file = LogFile(r'./logs/log_rf_bayes_opt.csv')

# Set up objective function & pass log file
f_objective = partial(objective, LogFile=log_file)

# Bayesian Optimization
trials = Trials()
best = fmin(fn=f_objective, space=space, max_evals=500, rstate=np.random.default_rng(1), algo=tpe.suggest, trials=trials)

  1%|          | 3/500 [05:33<17:23:18, 125.95s/trial, best loss: 10178.0]

In [None]:
best

### Best Estimator - Plots & Post-Processing

In [None]:
pipe = create_sklearn_pipeline(best)
pipe.fit(X_train, y_train)

scores = cross_val_score(pipe, cv=5, X=X_train, y=y_train, scoring=misclassification_cost, n_jobs=-1)
print(-scores.mean())

In [None]:
def plot_roc_auc(y_true, y_pred, model_name, file_path, figsize=(10, 8)):
    # Create figure
    fig = plt.figure(figsize=figsize)

    # Calculate ROC Curve & AUC
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    area = auc(fpr, tpr)
    plt.title('ROC Curve | %s | AUC = %0.5f' % (model_name, area))
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')

    # Save & close plot
    plt.plot(fpr, tpr)
    fig.savefig(file_path)
    plt.close(fig)

    return area


def plot_precision_recall_auc(y_true, y_pred, model_name, file_path, figsize=(10, 8)):
    # Create figure
    fig = plt.figure(figsize=figsize)

    # Calculate ROC Curve & AUC
    pr, rc, thresholds = precision_recall_curve(y_true, y_pred)
    area = auc(rc, pr)
    plt.title('Precision-Recall Curve | %s | AUC = %0.5f' % (model_name, area))
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # Save & close plot
    plt.plot(rc, pr)
    fig.savefig(file_path)
    plt.close(fig)

    return area

probs = pipe.predict_proba(X_train)

plot_roc_auc(y_train.replace({'neg': 0, 'pos': 1}), probs[:, 1], 'Complement Naive Bayes', r'./results/cnb_roc.jpg');
plot_precision_recall_auc(y_train.replace({'neg': 0, 'pos': 1}), probs[:, 1], 'Complement Naive Bayes', r'./results/cnb_pr.jpg');

### Archive

In [None]:
params_test = {
    'n_estimators': 150,
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 6,
    'max_features': 'sqrt',
    'bootstrap': True,
    'max_leaf_nodes': None,
}

clf = RandomForestClassifier(random_state=1, n_estimators=150, n_jobs=-1, **params_test)

steps = [
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('rf_clf', clf)
]

pipe = Pipeline(steps=steps)

scores = cross_val_score(pipe, cv=3, X=X_train, y=y_train, scoring=misclassification_cost, n_jobs=-1)

In [None]:
%%time
# Test run

trials = Trials()
space = {
    'fill_na': hp.choice('fill_na', [0, 'mean', 'most_frequent']),
    'sampling': hp.choice('sampling', [None, 'smote']),
    'class_weight': hp.choice('class_weight', ['balanced', 'balanced_subsample']),
    'max_depth': hp.choice('max_depth', [None, hp.quniform('max_depth_int', 2, 50, 2)]),
    'max_leaf_nodes': hp.loguniform('max_leaf_nodes', 0.1, 10)
}

log_file = LogFile(r'./logs/log_rf_bayes_opt.csv')

f_objective = partial(objective, LogFile=log_file)

best = fmin(fn=f_objective, space=space, max_evals=10, rstate=np.random.default_rng(1), algo=tpe.suggest, trials=trials)