# Random Forests


In [1]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    classification_report,
    r2_score,
    precision_score,
    recall_score,
    accuracy_score,
    f1_score,
)
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from binning import bin

Read the train and test datasets


In [2]:
train_df = pd.read_csv("Training_set.csv")
test_df = pd.read_csv("Testing_set.csv")

Get the training features and label


In [3]:
X_train = train_df.iloc[:, :-1].to_numpy()
y_train = train_df.iloc[:, -1].to_numpy()

Get the testing features and label


In [4]:
X_test = test_df.iloc[:, :-1].to_numpy()
y_test = test_df.iloc[:, -1].to_numpy()

Initialize a 5-fold cross-validation object


In [5]:
kf = KFold(n_splits=5)

## Random Forests Classifier


Bin the label


In [6]:
y_train_binned = bin(y_train)
y_test_binned = bin(y_test)

Initialize a RandomForestClassifier


In [7]:
rfc = RandomForestClassifier(n_jobs=4)

Initialize scores lists


In [8]:
precisions = []
recalls = []
accuracies = []
f1_scores = []

Train the model using 5-fold cross-validation


In [9]:
fold = 1
for train, test in kf.split(X_train, y_train_binned):
    print(f"##### FOLD: {fold} #####")

    # Fit the model
    rfc.fit(X_train[train], y_train_binned[train])

    # Predict on the test set
    predictions = rfc.predict(X_train[test])

    # Evaluate the model
    precision = precision_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )
    recall = recall_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )
    accuracy = accuracy_score(y_true=y_train_binned[test], y_pred=predictions)
    f1 = f1_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )

    # Store the result
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1)

    # Print the scores for each fold
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}\n")

    fold += 1

##### FOLD: 1 #####
Precision = 0.46660586555020495
Recall = 0.47609721934011207
Accuracy = 0.47609721934011207
F1 score = 0.4172043113027338

##### FOLD: 2 #####
Precision = 0.47088668881328233
Recall = 0.47776351114742616
Accuracy = 0.47776351114742616
F1 score = 0.41845069563153564

##### FOLD: 3 #####
Precision = 0.4653494075930285
Recall = 0.4747934581015006
Accuracy = 0.4747934581015006
F1 score = 0.41504673188516056

##### FOLD: 4 #####
Precision = 0.4694089649991363
Recall = 0.4789567202313788
Accuracy = 0.4789567202313788
F1 score = 0.4206364440800337

##### FOLD: 5 #####
Precision = 0.46968703634259634
Recall = 0.47732254257292195
Accuracy = 0.47732254257292195
F1 score = 0.4184898674918084



Print the mean scores of the folds


In [10]:
print("Mean Scores:")
print(f"Mean Precision = {np.mean(precisions)}")
print(f"Mean Recall = {np.mean(recalls)}")
print(f"Mean Accuracy = {np.mean(accuracies)}")
print(f"Mean F1 score = {np.mean(f1_scores)}")

Mean Scores:
Mean Precision = 0.4683875926596497
Mean Recall = 0.47698669027866797
Mean Accuracy = 0.47698669027866797
Mean F1 score = 0.4179656100782544


Make predictions on the test set


In [11]:
predictions = rfc.predict(X_test)

Evaluate the model


In [12]:
print(classification_report(y_true=y_test_binned, y_pred=predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.31      0.02      0.03     11718
           1       0.49      0.78      0.60     42216
           2       0.44      0.36      0.40     32447
           3       0.61      0.08      0.14      9998

    accuracy                           0.47     96379
   macro avg       0.46      0.31      0.29     96379
weighted avg       0.46      0.47      0.42     96379



## Random Forests Regressor


Initialize a RandomForestRegressor


In [13]:
rfr = RandomForestRegressor(n_jobs=4)

Initialize scores lists


In [14]:
r2_scores = []

Train the model using 5-fold cross-validation


In [15]:
fold = 1
for train, test in kf.split(X_train, y_train):
    print(f"##### FOLD: {fold} #####")

    # Fit the model
    rfr.fit(X_train[train], y_train[train])

    # Predict on the test set
    predictions = rfr.predict(X_train[test])

    # Evaluate the model
    r2 = r2_score(y_true=y_train[test], y_pred=predictions)

    # Store the result
    r2_scores.append(r2)

    # Print the scores for each fold
    print(f"R2 score = {r2}")

    fold += 1

##### FOLD: 1 #####
R2 score = 0.10097462198691853
##### FOLD: 2 #####
R2 score = 0.08919078099226141
##### FOLD: 3 #####
R2 score = 0.09476152873124877
##### FOLD: 4 #####
R2 score = 0.09542324038510774
##### FOLD: 5 #####
R2 score = 0.09483583931662942


Print the mean scores of the folds


In [16]:
print(f"Mean R2 score = {np.mean(r2_scores)}")

Mean R2 score = 0.09503720228243318


Make predictions on the test set


In [17]:
predictions = rfr.predict(X_test)

Evaluate the model


In [18]:
print(r2_score(y_true=y_test, y_pred=predictions))

0.08964064245552128
