In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, recall_score, precision_score, 
                             roc_auc_score, balanced_accuracy_score)

In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy

In [None]:
# Load the preprocessed table and split it into the target vector (`ys`, the
# "result" column) and the feature frame (`Xs`, every other column).
# Missing feature values are replaced by 0.
data = pd.read_csv("processed_data.csv")
ys = data["result"].values
Xs = data.drop(["result"], axis=1).fillna(0.)

# Drop features that are 0 in every row and therefore carry no information.
# The mask is computed on the *filled* feature frame on purpose:
#   * NaN == 0 is False, so a mask built from the raw `data` would never flag
#     columns that were (partly or entirely) NaN before `fillna`;
#   * the raw frame still contains "result", which is no longer a column of
#     `Xs` and would make the drop raise KeyError if the target were all 0.
nan_mask = (Xs == 0).all(axis=0)
Xs = Xs.drop(columns=Xs.columns[nan_mask.to_numpy()])


In [None]:
# Recursive feature elimination with cross-validation (RFECV).
# A class-weighted, entropy-based random forest ranks the features; RFECV
# scores candidate subsets by accuracy over 5 folds and never goes below
# 20 features.
# NOTE(review): the selector is fitted on the full dataset (Xs, ys), i.e.
# before any train/test split -- confirm this is acceptable for the
# evaluation that follows.
# NOTE(review): subsets are scored with plain "accuracy" although the forest
# uses class_weight="balanced" -- confirm this is the intended criterion.
clf = RandomForestClassifier(
    class_weight="balanced",
    criterion="entropy",
    random_state=42,
)
rfecv = RFECV(clf, min_features_to_select=20, scoring="accuracy", cv=5)
rfecv.fit(Xs.values, ys)

print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
# Learning-curve experiment on the RFECV-selected features.
#
# For every training fraction in `train_size`, and for every seed in
# `split_states`, the data is split, a random forest is tuned with a
# randomized hyperparameter search on the training part, and the refitted
# best estimator is scored on both parts.  `REF_ac[metric]` receives one
# `[mean, std]` pair (over the seeds) per training fraction, in the order of
# `train_size`.  `best_model` ends up as the estimator with the highest test
# balanced accuracy seen over all fractions and seeds.
#
# NOTE(review): `X_REF` uses the RFECV support mask, and RFECV was fitted on
# the full dataset, so the test rows took part in feature selection.
# NOTE(review): `best_model` is picked by its test-set score, so that score
# is an optimistic estimate for the chosen model.

train_size = np.arange(0.4, 1, 0.1)
X_REF = Xs.values[:, rfecv.support_]
params = {"n_estimators": np.arange(50, 250, 50),
          "min_samples_leaf": np.arange(5, 30)}
REF_ac = {"train": [], "test": [], "f1": [], "p": [], "r": [], "auc": []}
prev = 0
best_model = None  # stays None if no estimator ever scores above `prev`

np.random.seed(49)
split_states = np.random.randint(100, size=10)

for s in train_size:
    # Per-seed scores for this training fraction, keyed like REF_ac.
    scores = {metric: [] for metric in REF_ac}

    for i in split_states:
        seed = int(i)

        # The seed drives the split itself: with one fixed split per
        # fraction, every repetition would be evaluated on the same
        # partition and the reported std would only reflect the randomness
        # of the hyperparameter search.
        X_train, X_test, y_train, y_test = train_test_split(
            X_REF, ys, train_size=s, random_state=seed)

        # Hyperparameter tuning: 10 sampled candidates, default CV and
        # scoring, best candidate refitted on the whole training part.
        rf = RandomForestClassifier(criterion="entropy", class_weight="balanced", random_state=49)
        clf = RandomizedSearchCV(rf, params, n_iter=10, random_state=seed)
        clf.fit(X_train, y_train)
        best_rf = clf.best_estimator_

        test_pred = best_rf.predict(X_test)
        train_pred = best_rf.predict(X_train)
        # Probability of the second class, used as the ranking score for AUC.
        test_proba = best_rf.predict_proba(X_test)[:, 1]

        scores["train"].append(balanced_accuracy_score(y_train, train_pred))
        scores["test"].append(balanced_accuracy_score(y_test, test_pred))
        scores["f1"].append(f1_score(y_test, test_pred))
        scores["p"].append(precision_score(y_test, test_pred))
        scores["r"].append(recall_score(y_test, test_pred))
        scores["auc"].append(roc_auc_score(y_test, test_proba))

        # Track the best estimator seen so far across all fractions/seeds.
        if scores["test"][-1] > prev:
            prev = scores["test"][-1]
            best_model = best_rf

    # Summarise this training fraction as [mean, std] for every metric.
    for metric, values in scores.items():
        REF_ac[metric].append([np.mean(values), np.std(values)])