# Random Forests


In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    precision_score,
    recall_score,
    accuracy_score,
    f1_score,
)
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from binning import bin

Read the train and test datasets


In [2]:
train_df = pd.read_csv("Training_set.csv")
test_df = pd.read_csv("Testing_set.csv")

Convert training set OHE columns to int8


In [3]:
columns_to_convert = train_df.columns[
    17:-1
]  # Columns from 17th column up to but excluding the last column
train_df[columns_to_convert] = train_df[columns_to_convert].astype("int8")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385516 entries, 0 to 385515
Columns: 4073 entries, ScheduledArrTime to DepDelay
dtypes: float64(12), int64(6), int8(4055)
memory usage: 1.5 GB


Convert testing set OHE columns to int8


In [4]:
columns_to_convert = test_df.columns[
    17:-1
]  # Columns from 17th column up to but excluding the last column
test_df[columns_to_convert] = test_df[columns_to_convert].astype("int8")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96379 entries, 0 to 96378
Columns: 4073 entries, ScheduledArrTime to DepDelay
dtypes: float64(12), int64(6), int8(4055)
memory usage: 385.9 MB


Get the training features and label


In [5]:
X_train = train_df.iloc[:, :-1].to_numpy()
y_train = train_df.iloc[:, -1].to_numpy()

Get the testing features and label


In [6]:
X_test = test_df.iloc[:, :-1].to_numpy()
y_test = test_df.iloc[:, -1].to_numpy()

Initialize a 5-fold cross-validation object


In [7]:
kf = KFold(n_splits=5)

Bin the label


In [8]:
y_train_binned = bin(y_train)
y_test_binned = bin(y_test)

Create a new RandomForest classifier with the best parameter values


In [9]:
rfc = RandomForestClassifier(n_jobs=5, class_weight={0: 10, 1: 1})

Initialize scores lists


In [10]:
precisions = []
recalls = []
accuracies = []
f1_scores = []

Train the model using 5-fold cross-validation


In [11]:
fold = 1
for train, test in kf.split(X_train, y_train_binned):
    print(f"##### FOLD: {fold} #####")

    # Fit the model
    rfc.fit(X_train[train], y_train_binned[train])

    # Predict on the test set
    predictions = rfc.predict(X_train[test])

    # Evaluate the model
    precision = precision_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )
    recall = recall_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )
    accuracy = accuracy_score(y_true=y_train_binned[test], y_pred=predictions)
    f1 = f1_score(
        y_true=y_train_binned[test],
        y_pred=predictions,
        zero_division=0,
        average="weighted",
    )

    # Store the result
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    f1_scores.append(f1)

    # Print the scores for each fold
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}\n")
    print(
        classification_report(
            y_true=y_train_binned[test], y_pred=predictions, zero_division=0
        )
    )

    fold += 1

##### FOLD: 1 #####
Precision = 0.6344091621191714
Recall = 0.6524434529985474
Accuracy = 0.6524434529985474
F1 score = 0.6318950230324862

              precision    recall  f1-score   support

           0       0.55      0.36      0.43     28516
           1       0.69      0.83      0.75     48588

    accuracy                           0.65     77104
   macro avg       0.62      0.59      0.59     77104
weighted avg       0.63      0.65      0.63     77104

##### FOLD: 2 #####
Precision = 0.6328966030406921
Recall = 0.6510252519357224
Accuracy = 0.6510252519357224
F1 score = 0.6314776839781701

              precision    recall  f1-score   support

           0       0.54      0.36      0.43     28427
           1       0.69      0.82      0.75     48676

    accuracy                           0.65     77103
   macro avg       0.61      0.59      0.59     77103
weighted avg       0.63      0.65      0.63     77103

##### FOLD: 3 #####
Precision = 0.6321077059365956
Recall = 0.6505

Print the mean scores of the folds


In [12]:
print("Mean Scores:")
print(f"Mean Precision = {np.mean(precisions)}")
print(f"Mean Recall = {np.mean(recalls)}")
print(f"Mean Accuracy = {np.mean(accuracies)}")
print(f"Mean F1 score = {np.mean(f1_scores)}")

Mean Scores:
Mean Precision = 0.6332960484338471
Mean Recall = 0.6516305398143963
Mean Accuracy = 0.6516305398143963
Mean F1 score = 0.6311723278492347


Make predictions on the test set


In [13]:
predictions = rfc.predict(X_test)

Evaluate the model


In [14]:
print(classification_report(y_true=y_test_binned, y_pred=predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.54      0.36      0.43     35520
           1       0.69      0.82      0.75     60859

    accuracy                           0.65     96379
   macro avg       0.61      0.59      0.59     96379
weighted avg       0.63      0.65      0.63     96379



In [15]:
precision = precision_score(
    y_true=y_test_binned,
    y_pred=predictions,
    zero_division=0,
    average="weighted",
)
recall = recall_score(
    y_true=y_test_binned,
    y_pred=predictions,
    zero_division=0,
    average="weighted",
)
accuracy = accuracy_score(y_true=y_test_binned, y_pred=predictions)
f1 = f1_score(
    y_true=y_test_binned,
    y_pred=predictions,
    zero_division=0,
    average="weighted",
)

In [16]:
print("Mean Scores:")
print(f"Mean Precision = {np.mean(precision)}")
print(f"Mean Recall = {np.mean(recall)}")
print(f"Mean Accuracy = {np.mean(accuracy)}")
print(f"Mean F1 score = {np.mean(f1)}")

Mean Scores:
Mean Precision = 0.6338206931450611
Mean Recall = 0.6520922607621993
Mean Accuracy = 0.6520922607621993
Mean F1 score = 0.6318835579414409
