In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, recall_score, precision_score, 
                             roc_auc_score, balanced_accuracy_score)

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy

In [21]:
# Load the preprocessed table, separate the label column ("result") from the
# features, and drop feature columns that carry no information.

data = pd.read_csv("processed_data.csv")
t_ys = data["result"].values
# Missing feature values are replaced by 0.
Xs = data.drop(["result"], axis=1).fillna(0.)
# Flag feature columns that are zero in every row.  The mask is computed on the
# filled feature frame rather than on `data` so that
#   * columns that were (partly) NaN and are all-zero after fillna are caught
#     (NaN == 0 is False, so they were missed when comparing the raw frame), and
#   * the label column can never be selected; dropping "result" from Xs, which
#     no longer contains it, would raise a KeyError.
nan_mask = np.all(Xs == 0, axis=0)
t_Xs = Xs.drop(list(Xs.columns[nan_mask]), axis=1)


In [8]:
# Held-out set: keep 80% of the samples as (Xs, ys) and set the rest aside as
# (hXs, hys).  The split is stratified on the labels and uses a fixed seed.
Xs, hXs, ys, hys = train_test_split(
    t_Xs,
    t_ys,
    train_size=0.8,
    stratify=t_ys,
    random_state=49,
)

In [None]:
# Recursive feature elimination with 5-fold cross-validation.
# A class-weighted, entropy-based random forest ranks the features; RFECV keeps
# at least 20 of them and scores each candidate subset by accuracy.

clf = RandomForestClassifier(
    criterion="entropy",
    class_weight="balanced",
    random_state=49,
)
rfecv = RFECV(estimator=clf, min_features_to_select=20, scoring="accuracy", cv=5)
# Fit on the non-held-out samples only.
rfecv.fit(Xs.values, ys)

print(f"Optimal number of features: {rfecv.n_features_}")

In [20]:
# Learning-curve experiment on the RFECV-selected features.
# For every training fraction, a randomized hyperparameter search is repeated
# with several search seeds; balanced accuracy, F1, precision, recall and
# ROC AUC are recorded on the inner test split and on the held-out set.
train_size = np.arange(0.4, 1, 0.1)
X_REF = Xs.values[:, rfecv.support_]
# Held-out features restricted to the selected columns (loop-invariant).
hX_REF = hXs.values[:, rfecv.support_]
params = {"n_estimators": np.arange(50, 250, 50),
            "min_samples_leaf": np.arange(5, 30)}
REF_ac = {"train": [], "test": [], "held out": [], "f1": [], "p": [], "r": [], "auc": []}
# Best balanced accuracy seen so far on the held-out set (prev) and on the
# inner test split (prev_test), tracked across all fractions and seeds.
prev = 0
prev_test = 0

np.random.seed(49)
split_states = np.random.randint(100, size=10)
# Size every per-run buffer from the number of seeds instead of a literal 10/20.
n_runs = len(split_states)

for s in train_size:
    train_ac = np.zeros(n_runs)
    test_ac = np.zeros(n_runs)
    held_ac = np.zeros(n_runs)
    # Interleaved buffers: even slots hold inner-test scores, odd slots hold
    # held-out scores for the same run.
    test_F1 = np.zeros(2 * n_runs)
    test_p = np.zeros(2 * n_runs)
    test_r = np.zeros(2 * n_runs)
    test_auc = np.zeros(2 * n_runs)
    # NOTE(review): this inner split is not stratified, unlike the held-out
    # split -- confirm that is intended.
    X_train, X_test, y_train, y_test = train_test_split(X_REF, ys, train_size=s, random_state=49)
    for i, si in enumerate(split_states):
        # hyperparameter tuning; only the search seed changes between runs
        rf = RandomForestClassifier(criterion="entropy", random_state=49, class_weight="balanced")
        clf = RandomizedSearchCV(rf, params, n_iter=10, random_state=si)
        search = clf.fit(X_train, y_train)
        best_rf = clf.best_estimator_

        test_pred = best_rf.predict(X_test)
        htest_pred = best_rf.predict(hX_REF)
        train_pred = best_rf.predict(X_train)
        test_ac[i] = balanced_accuracy_score(y_test, test_pred)
        held_ac[i] = balanced_accuracy_score(hys, htest_pred)
        train_ac[i] = balanced_accuracy_score(y_train, train_pred)

        # Compare the score of the *current* run (index i).  Indexing with -1
        # reads the last slot, which is still 0 for every run but the final
        # one, so the best model would only ever be updated on the last seed.
        if test_ac[i] > prev_test:
            prev_test = test_ac[i]
            best_test_model = deepcopy(best_rf)
        # NOTE(review): best_model is selected on held-out scores, so its
        # held-out performance is optimistically biased.
        if held_ac[i] > prev:
            prev = held_ac[i]
            best_model = deepcopy(best_rf)

        # Inner-test metrics (even slots).
        test_F1[2*i] = f1_score(y_test, test_pred)
        test_p[2*i] = precision_score(y_test, test_pred)
        test_r[2*i] = recall_score(y_test, test_pred)
        test_auc[2*i] = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])

        # Held-out metrics (odd slots).
        test_F1[2*i+1] = f1_score(hys, htest_pred)
        test_p[2*i+1] = precision_score(hys, htest_pred)
        test_r[2*i+1] = recall_score(hys, htest_pred)
        test_auc[2*i+1] = roc_auc_score(hys, best_rf.predict_proba(hX_REF)[:, 1])

    # One entry per training fraction for the accuracies; two entries
    # (inner test, then held out) per fraction for the other metrics.
    REF_ac["train"] += [train_ac]
    REF_ac["test"] += [test_ac]
    REF_ac["held out"] += [held_ac]
    REF_ac["f1"] += [test_F1[0::2], test_F1[1::2]]
    REF_ac["p"] += [test_p[0::2], test_p[1::2]]
    REF_ac["r"] += [test_r[0::2], test_r[1::2]]
    REF_ac["auc"] += [test_auc[0::2], test_auc[1::2]]