# Random Forest

In [None]:
import os
import math
import pandas as pd
from joblib import dump, load
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Base directory for the notebook's inputs and outputs: the data/, results/
# and models/ paths used in later cells are all joined onto this value.
ROOT_PATH = "."

## Read in dataset

In [None]:
# Load the pre-split feature and label tables from <ROOT_PATH>/data.
X_train = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_train_values.csv"))
y_train = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_train_labels.csv"))
X_test = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_test_values.csv"))
y_test = pd.read_csv(os.path.join(ROOT_PATH, "data", "split_test_labels.csv"))
# NOTE(review): the label tables stay DataFrames; if they hold a single label
# column, scikit-learn prefers a 1-d array -- confirm the label file layout.

# One-hot encode the categorical columns of the training features.
X_train = pd.get_dummies(X_train)
# Encoding the test split on its own can yield a different column set when a
# category is missing from (or only present in) one split. Align the test
# frame to the training columns: dummies unseen in test are added as 0, and
# dummies unseen in train are dropped, so both frames share the same layout.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

display(X_train)

## Hyperparameter Tuning

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.metrics import multilabel_confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, cross_validate, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb

In [None]:
# Base estimator for the hyperparameter searches: feature standardisation
# followed by a random forest.
# The seed has to be passed by keyword: the first positional parameter of
# RandomForestClassifier is n_estimators, so RandomForestClassifier(42) asked
# for 42 trees and left the forest unseeded.
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
print(pipeline)

In [None]:
def save_cv(cv_results_, filepath):
    """Write a search's ``cv_results_`` mapping to a CSV file.

    Args:
        cv_results_: Mapping of column name to per-candidate values, such as
            the ``cv_results_`` attribute of a fitted search object. Anything
            ``pd.DataFrame`` accepts works.
        filepath: Destination of the CSV. When it is a filesystem path, any
            missing parent directories are created first.
    """
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(filepath)
        # A bare file name has no parent component; makedirs("") would raise.
        if parent_dir:
            # to_csv does not create folders, so a missing results directory
            # would otherwise lose the output of a long search.
            os.makedirs(parent_dir, exist_ok=True)
    cv_results = pd.DataFrame(cv_results_)
    cv_results.to_csv(filepath)

In [None]:
# Search space for the randomized search. Keys use the
# "<pipeline step>__<parameter>" form so they reach the forest inside
# `pipeline`.
hp_space = {
    'randomforestclassifier__max_depth': np.arange(30, 50),
    # Float values are treated by scikit-learn as a fraction of the samples.
    'randomforestclassifier__min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    'randomforestclassifier__min_samples_leaf': np.arange(1, 10),
    # NOTE(review): range() stops at n_columns - 1, so "use every feature"
    # is never a candidate -- confirm this is intended.
    'randomforestclassifier__max_features': list(range(1, len(X_train.columns))),
    'randomforestclassifier__n_estimators': [16, 32, 40, 50, 60, 128],
}

# 100 sampled candidates, each scored with 5-fold CV on micro-averaged F1.
# random_state pins which candidates are drawn so reruns of the notebook
# evaluate the same settings and the saved results stay comparable.
randcv = RandomizedSearchCV(pipeline, hp_space, cv=5, verbose=2, scoring='f1_micro',
                            n_iter=100, n_jobs=-2, random_state=42)

In [None]:
# Run the randomized hyperparameter search on the training split.
# NOTE(review): y_train is the DataFrame returned by read_csv; if it holds a
# single label column, scikit-learn expects a 1-d array here -- confirm.
randcv.fit(X_train, y_train)

In [None]:
# Report the outcome of the randomized search, then persist the per-candidate
# CV table. The message now says what is actually written (the CV results,
# not a model), and the dangling `print("All ")` fragment has been dropped.
print("Finished training RF with randomized search CV. Saving CV results...")
print("Best params: {}".format(randcv.best_params_))
print("Best Micro F1 score: {}".format(randcv.best_score_))
print("Summary of CV_results: {}".format(randcv.cv_results_.keys()))
save_cv(randcv.cv_results_, os.path.join(ROOT_PATH, "results", "rf_rand1.csv"))

In [None]:
# Exhaustive grid over a small set of values per forest hyperparameter:
# 3 * 3 * 2 * 3 * 2 = 108 combinations, each evaluated with 5-fold CV.
hp_space2 = dict(
    randomforestclassifier__max_depth=[34, 36, 40],
    randomforestclassifier__min_samples_split=[0.1, 0.2, 0.3],
    randomforestclassifier__min_samples_leaf=[3, 4],
    randomforestclassifier__max_features=[56, 64, 68],
    randomforestclassifier__n_estimators=[60, 128],
)

# Scored on micro-averaged F1; training-fold scores are kept as well so they
# end up in cv_results_ next to the validation scores.
gscv1 = GridSearchCV(
    estimator=pipeline,
    param_grid=hp_space2,
    scoring='f1_micro',
    cv=5,
    n_jobs=-2,
    verbose=True,
    return_train_score=True,
)

In [None]:
# Run the exhaustive grid search over hp_space2 on the training split.
# NOTE(review): y_train is the DataFrame returned by read_csv; if it holds a
# single label column, scikit-learn expects a 1-d array here -- confirm.
gscv1.fit(X_train, y_train)

In [None]:
# Report the outcome of the grid search, then persist the per-candidate CV
# table. The message now says what is actually written (the CV results, not
# a model), and the dangling `print("All ")` fragment has been dropped.
print("Finished training RF with grid search CV. Saving CV results...")
print("Best params: {}".format(gscv1.best_params_))
print("Best Micro F1 score: {}".format(gscv1.best_score_))
print("Summary of CV_results: {}".format(gscv1.cv_results_.keys()))
save_cv(gscv1.cv_results_, os.path.join(ROOT_PATH, "results", "rf_grid2.csv"))

## Retraining then storing the model

In [None]:
# Final model: a seeded forest with fixed hyperparameters, placed behind the
# same standardisation step that was used during the searches.
# NOTE(review): max_depth=42, min_samples_leaf=7 and min_samples_split=2 are
# not among the values listed in hp_space2 -- confirm where they come from.
rf2 = RandomForestClassifier(
    n_estimators=60,
    max_depth=42,
    max_features=56,
    min_samples_split=2,
    min_samples_leaf=7,
    random_state=1000,
)
# Rebinds `pipeline`, replacing the unfitted search template defined earlier.
pipeline = make_pipeline(StandardScaler(), rf2)

In [None]:
# Fit the final scaler + forest pipeline on the full training split.
# NOTE(review): y_train is the DataFrame returned by read_csv; if it holds a
# single label column, scikit-learn expects a 1-d array here -- confirm.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; the original call
# had the two swapped.
print("Micro F1-score random forest: {}".format(f1_score(y_test, y_pred, average='micro')))

In [None]:
from joblib import dump

# Persist the fitted pipeline (scaler + forest) so it can be reloaded later
# with joblib.load. joblib.dump does not create missing folders, so make sure
# the models directory exists before writing.
models_dir = os.path.join(ROOT_PATH, 'models')
os.makedirs(models_dir, exist_ok=True)
dump(pipeline, os.path.join(models_dir, 'rfv2.joblib'))