In [1]:
import csv

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
# Path to the CSV that maps each dataset column to its pandas dtype.
# Forward slashes resolve on Windows, Linux and macOS alike, whereas a
# backslash is only treated as a path separator on Windows.
csv_file = "../data_types.csv"

# Build the {column name: dtype string} mapping that is passed to pd.read_csv
# when the train/test dumps are loaded.
# newline="" is what the csv module asks for so that line endings embedded in
# quoted fields are parsed correctly.
with open(csv_file, "r", newline="") as f:
    dtypes = {row["Column Name"]: row["Data Type"] for row in csv.DictReader(f)}

In [3]:
# Load the training dump, applying the per-column dtypes collected in `dtypes`.
# Forward slashes keep the relative path valid on every OS, not only Windows.
df_train = pd.read_csv('../train_dump.csv', dtype=dtypes)

In [4]:
# Load the test dump, applying the per-column dtypes collected in `dtypes`.
# Forward slashes keep the relative path valid on every OS, not only Windows.
df_test = pd.read_csv('../test_dump.csv', dtype=dtypes)

In [5]:
# Drop the first column of each frame: it is the index column that was read
# back from the CSV.
# NOTE: this rebinds df_train/df_test, so running the cell a second time
# strips one more column from each frame.
df_train, df_test = (frame.iloc[:, 1:] for frame in (df_train, df_test))

In [6]:
# Select the features by name rather than by position: the target is read as
# df['label'], so the feature matrix must exclude exactly that column even if
# it is not the last one (a positional [:-1] slice would then leak the label
# into the features and discard a real feature instead).
feature_columns = df_train.columns.drop('label')

x_train = df_train[feature_columns].to_numpy()
y_train = df_train['label'].to_numpy()

# Index the test frame with the training feature list so both matrices hold
# the same columns in the same order; a column missing from the test dump
# raises a KeyError instead of silently misaligning the features.
x_test = df_test[feature_columns].to_numpy()
y_test = df_test['label'].to_numpy()

In [7]:
# The cross-validation loop calls kf.split(x_train, y_train); a stratified
# splitter actually uses y_train to keep the class ratio in every fold, which
# matters when one class is rare. Shuffling with a fixed seed makes the folds
# independent of the row order in the dump while staying reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold validation scores, filled by the cross-validation loop.
f1s = []
accuracies = []
precisions = []
recalls = []
roc_aucs = []  # kept for compatibility; nothing visible in this notebook fills it

In [8]:
# Start from empty score lists so that re-running this cell does not append a
# second set of fold scores to the previous ones and skew the means.
f1s = []
accuracies = []
precisions = []
recalls = []

# Fixed seed so the forest, and therefore every score below, is reproducible.
model = RandomForestClassifier(random_state=42)

for fold, (train, valid) in enumerate(kf.split(x_train, y_train), start=1):
    print(f"##### FOLD: {fold} #####")

    # Held-out rows of this fold
    x_valid, y_valid = x_train[valid], y_train[valid]

    # Fit on this fold's training rows (warm_start is left at its default, so
    # each fit() call builds a new forest rather than extending the last one)
    model.fit(x_train[train], y_train[train])

    # Predict on the validation rows of this fold (not on the test set)
    predictions = model.predict(x_valid)

    # Evaluate the model; zero_division=0 scores an undefined ratio as 0.
    # NOTE(review): with the default average='binary' precision/recall/F1
    # describe the class labelled 1 only - confirm that is the class of interest.
    precision = precision_score(y_true=y_valid, y_pred=predictions, zero_division=0)
    recall = recall_score(y_true=y_valid, y_pred=predictions, zero_division=0)
    accuracy = accuracy_score(y_true=y_valid, y_pred=predictions)
    f1 = f1_score(y_true=y_valid, y_pred=predictions, zero_division=0)

    # Store the result
    f1s.append(f1)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)

    # Print the scores for each fold
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"Accuracy = {accuracy}")
    print(f"F1 score = {f1}\n")

# Average of the per-fold validation scores
print("\nMean Scores: ")
print(f"Mean F1 score = {np.mean(f1s)}")
print(f"Mean Accuracy = {np.mean(accuracies)}")
print(f"Mean Precision = {np.mean(precisions)}")
print(f"Mean Recall = {np.mean(recalls)}")

##### FOLD: 1 #####


In [None]:
# After cross-validation `model` holds the fit from the last fold only, i.e.
# it was trained without that fold's validation rows. Refit on the complete
# training set before scoring the held-out test set.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Per-class precision/recall/F1 on the test set. zero_division=0 matches the
# setting used for the cross-validation metrics; with 1, a class that is never
# predicted would be reported with a perfect precision.
classification = classification_report(y_test, y_pred, zero_division=0)
print("\nClassification Report:\n", classification)


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98      8062
           1       1.00      1.00      1.00    353872

    accuracy                           1.00    361934
   macro avg       1.00      0.98      0.99    361934
weighted avg       1.00      1.00      1.00    361934

