# Random Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
#from sklearn.model_selection import cross_val_score




print(f"Notebook last run (end-to-end): {datetime.datetime.now()}")

Notebook last run (end-to-end): 2024-03-10 21:29:25.723503


In [2]:
SEED_ORIG = 2024
SEED = SEED_ORIG
NUM_SIMS = 10
rstate = np.random.default_rng(SEED_ORIG)

# Hyperparameter tuning with SMOTE and Hyperopt

In [3]:
df = pd.read_csv("../data/HTRU_2.csv",sep=",",header=None)
labels = df[8]
df = df.drop([8],axis=1)
df["Labels"] = labels
df["Labels"].value_counts()

0    16259
1     1639
Name: Labels, dtype: int64

In [4]:
X_train_, X_test = train_test_split(df, test_size=0.1, random_state=SEED_ORIG) # Splitting the test set
y_train_ = X_train_["Labels"]
X_train_ = X_train_.drop("Labels",axis=1)

In [5]:
sm = SMOTE(random_state=SEED_ORIG)
X_resampled,y_resampled= sm.fit_resample(X_train_, y_train_)
len(X_resampled), len(y_resampled)

(29282, 29282)

In [6]:
X_resampled["Labels"] = y_resampled

In [7]:
X_train, X_valid = train_test_split(X_resampled, test_size=0.1, random_state=SEED_ORIG) 
cols = X_train.columns[:-1]


In [8]:
len(X_train), len(X_valid), len(X_test)


(26353, 2929, 1790)

In [9]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
23647,88.252228,44.740535,1.334553,2.606667,8.658360,38.102912,4.774594,22.500578,1
15502,146.093750,46.782148,-0.182414,0.083016,1.823579,18.376583,10.816643,120.463003,0
18633,35.182888,37.299263,3.901432,16.401026,60.909358,69.969228,1.218334,0.519661,1
15513,114.156250,41.541289,0.577129,1.516991,1.039298,12.752767,14.106027,217.465501,0
25596,19.055439,35.070892,5.206158,27.650399,122.016522,74.897721,-0.105256,-1.007368,1
...,...,...,...,...,...,...,...,...,...
14875,126.125000,42.971284,-0.059258,0.265109,2.623746,19.325817,8.561876,80.169221,0
2688,113.398438,40.149397,0.322763,0.532786,2.985786,16.622323,8.176916,83.887394,0
19040,69.889086,32.188954,3.004694,14.108755,16.239157,47.742230,3.309714,10.573754,1
19962,65.243681,38.455560,2.207688,7.442995,37.377171,68.128617,1.624815,1.260027,1


In [10]:
X_train["Labels"].value_counts(), X_valid["Labels"].value_counts(), X_test["Labels"].value_counts()

(1    13189
 0    13164
 Name: Labels, dtype: int64,
 0    1477
 1    1452
 Name: Labels, dtype: int64,
 0    1618
 1     172
 Name: Labels, dtype: int64)

In [11]:
y_train = X_train["Labels"]
y_valid = X_valid["Labels"]
y_test = X_test["Labels"]
X_train = X_train.drop("Labels",axis=1)
X_valid = X_valid.drop("Labels",axis=1)
X_test = X_test.drop("Labels",axis=1)


In [12]:
len(X_train), len(X_valid)

(26353, 2929)

In [13]:
X_train_ = pd.concat([X_train,X_valid],axis=0,ignore_index=True)

In [14]:
y_train_ = pd.concat([y_train,y_valid],axis=0,ignore_index=True)

In [15]:
len(y_train_)

29282

In [16]:
# Creating the validation fold
valid_fold = np.concatenate([
    -np.ones(len(X_train)), # Training set
    np.zeros(len(X_valid)) # Validation set
])

In [17]:
valid_fold[:10], valid_fold[-10:]

(array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [18]:
ps = PredefinedSplit(test_fold=valid_fold)

In [19]:
#!pip install hyperopt

In [20]:
# Space
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 50, stop = 1000, num = 10)]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'min_samples_split' : hp.choice('min_samples_split', [2, 5, 10, 14]),
    'min_samples_leaf' : hp.choice('min_samples_leaf', [1, 2, 4, 6, 8]),
}

In [21]:
# Objective function
def objective(params):

    rf=RandomForestClassifier(random_state=SEED,**params)

    """
    score = cross_val_score(estimator=xgboost,
                            X=X_train_,
                            y=y_train_,
                            cv=ps,
                            scoring='f1',
                            n_jobs=-1).mean()
    """
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_valid)
    score = f1_score(y_valid,y_pred)
    #score = recall_score(y_valid, y_pred)

    # Loss is negative score
    loss = - score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [22]:
best_models = []
times_train = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    start_time = default_timer()
    SEED = SEED_ORIG + i
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 10, trials = Trials(), rstate=rstate)
    best_models.append(best)
    end_time = default_timer()
    times_train.append(end_time-start_time)
    print(f"Training time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:47<00:00, 16.79s/trial, best loss: -0.9723947550034506]
Training time: 167.91s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:47<00:00, 16.78s/trial, best loss: -0.9734391169368747]
Training time: 167.80s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
100%|██████████| 10/10 [03:16

In [23]:
# best hyperparameters
best

{'max_depth': 5,
 'min_samples_leaf': 0,
 'min_samples_split': 1,
 'n_estimators': 0}

In [24]:
results_valid = []
results_test = []
times_eval = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    
    SEED = SEED_ORIG + i
    
    # Train model using the best parameters
    rf_bo = RandomForestClassifier(random_state=SEED,
                               n_estimators=space_eval(space, best_models[i])["n_estimators"],
                               max_depth=space_eval(space,best_models[i])["max_depth"],
                               min_samples_split=space_eval(space,best_models[i])["min_samples_split"],
                               min_samples_leaf=space_eval(space,best_models[i])["min_samples_leaf"]
                              ).fit(X_train,y_train)
    
    start_time = default_timer()
    y_pred = rf_bo.predict(X_valid)
    recall = recall_score(y_valid, y_pred)
    f1score = f1_score(y_valid,y_pred)
    mcc = matthews_corrcoef(y_valid, y_pred)
    acc = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_valid.append(results_temp)
    
    y_pred = rf_bo.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test,y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_test.append(results_temp)
    
    end_time = default_timer()
    times_eval.append(end_time-start_time)
    print(f"Evaluation time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
Evaluation time: 0.22s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
Evaluation time: 0.39s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
Evaluation time: 0.06s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 3


In [25]:
results_test

[{'recall': 0.9127906976744186,
  'f1score': 0.8674033149171271,
  'mcc': 0.8538348765499884,
  'accuracy': 0.9731843575418995,
  'precision': 0.9084703947368421},
 {'recall': 0.9127906976744186,
  'f1score': 0.8626373626373627,
  'mcc': 0.8487302940523799,
  'accuracy': 0.9720670391061452,
  'precision': 0.9041607999582812},
 {'recall': 0.9127906976744186,
  'f1score': 0.8650137741046832,
  'mcc': 0.8512730466213766,
  'accuracy': 0.9726256983240223,
  'precision': 0.9063043328782059},
 {'recall': 0.9127906976744186,
  'f1score': 0.8698060941828255,
  'mcc': 0.8564160343643948,
  'accuracy': 0.9737430167597766,
  'precision': 0.9106593432015044},
 {'recall': 0.9127906976744186,
  'f1score': 0.8602739726027396,
  'mcc': 0.8462063728498886,
  'accuracy': 0.9715083798882681,
  'precision': 0.9020394457223875},
 {'recall': 0.9127906976744186,
  'f1score': 0.8486486486486486,
  'mcc': 0.8338609318958874,
  'accuracy': 0.9687150837988827,
  'precision': 0.8917535911882646},
 {'recall': 0.91

In [26]:
times_train

[167.9142983749989,
 167.80324730799475,
 196.19739941100124,
 186.19431457899918,
 166.5695278409985,
 137.320864789006,
 225.20823816600023,
 192.8496089409964,
 261.19711980399734,
 168.96166101699782]

In [27]:
results_valid

[{'recall': 0.9703856749311295,
  'f1score': 0.9723947550034506,
  'mcc': 0.9453758931086814,
  'accuracy': 0.9726869238648003,
  'precision': 0.9727084458347206},
 {'recall': 0.971763085399449,
  'f1score': 0.9734391169368747,
  'mcc': 0.9474224882212003,
  'accuracy': 0.9737111642198703,
  'precision': 0.9737278113683017},
 {'recall': 0.9703856749311295,
  'f1score': 0.9713891761461565,
  'mcc': 0.9433222263510977,
  'accuracy': 0.9716626835097303,
  'precision': 0.9716703504746983},
 {'recall': 0.9724517906336089,
  'f1score': 0.9737931034482759,
  'mcc': 0.9481037015497042,
  'accuracy': 0.9740525776715603,
  'precision': 0.9740646718470796},
 {'recall': 0.96900826446281,
  'f1score': 0.970010341261634,
  'mcc': 0.9405906662464595,
  'accuracy': 0.9702970297029703,
  'precision': 0.9703045436741089},
 {'recall': 0.953168044077135,
  'f1score': 0.9604441360166551,
  'mcc': 0.9222461856655704,
  'accuracy': 0.9610788665073404,
  'precision': 0.9612342960575115},
 {'recall': 0.9696969

In [28]:
# validation f1-scores
values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]

In [29]:
# the best f1-score model
ind = np.argmax(np.array(values))
ind, np.max(np.array(values))

(7, 0.9772883688919477)

In [30]:
# results corresponding to the best validation f1-score
results_test[ind]

{'recall': 0.9127906976744186,
 'f1score': 0.8674033149171271,
 'mcc': 0.8538348765499884,
 'accuracy': 0.9731843575418995,
 'precision': 0.9084703947368421}

In [31]:
# average training times
np.mean(times_train)

187.02162802309903

In [32]:
# average inference times
np.mean(times_eval)

0.3746520758984843

In [33]:
# average validation f1 scores
valid_f1score_values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(valid_f1score_values)

0.9711176872313813

In [34]:
# average test f1 scores
test_f1score_values = [results_test[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(test_f1score_values)

0.8621339578719087

In [35]:
# average test mcc scores
test_mcc_values = [results_test[i]["mcc"] for i in range(NUM_SIMS)]
np.mean(test_mcc_values)

0.8481855484767031

In [36]:
# average test accuracy scores
test_acc_values = [results_test[i]["accuracy"] for i in range(NUM_SIMS)]
np.mean(test_acc_values)

0.9719553072625698

In [37]:
# average test precision scores
test_precision_values = [results_test[i]["precision"] for i in range(NUM_SIMS)]
np.mean(test_precision_values)

0.9039494010880549

In [38]:
# average test recall scores
test_recall_values = [results_test[i]["recall"] for i in range(NUM_SIMS)]
np.mean(test_recall_values)

0.9122093023255813