# Decision Trees Classifier


In [54]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
import pandas as pd
import numpy as np

Read the train and test datasets


In [25]:
train_df = pd.read_csv("Training_Set_binned.csv")
test_df = pd.read_csv("Testing_Set_binned.csv")

Get the training features and label


In [46]:
X = train_df.iloc[:, :-1].to_numpy()
y = train_df.iloc[:, -1].to_numpy()

Get the testing features and label


In [52]:
X_test = test_df.iloc[:, :-1].to_numpy()
y_test = test_df.iloc[:, -1].to_numpy()

Initialize a DecisionTreeClassifier


In [63]:
dtc = DecisionTreeClassifier()

Initialize a KFold object


In [64]:
kf = KFold(n_splits=5)

Train the model using 5-Fold cross-validation


In [None]:
# Lists to store each fold's scores
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Loop through each fold
for train_index, test_index in kf.split(X):
    X_train, X_validation = X[train_index], X[test_index]
    y_train, y_validation = y[train_index], y[test_index]

    # Fit the model on the training data
    dtc.fit(X_train, y_train)

    # Make predictions on the validation data
    y_pred = dtc.predict(X_validation)

    # Calculate metrics and append to lists
    accuracies.append(accuracy_score(y_validation, y_pred))
    precisions.append(precision_score(y_validation, y_pred, average="weighted"))
    recalls.append(recall_score(y_validation, y_pred, average="weighted"))
    f1_scores.append(f1_score(y_validation, y_pred, average="weighted"))

Print the mean of each metric across all folds


In [66]:
print(f"Mean accuracy: {np.mean(accuracies)}")
print(f"Mean precision: {np.mean(precisions)}")
print(f"Mean recall: {np.mean(recalls)}")
print(f"Mean F1-score: {np.mean(f1_scores)}")

Mean accuracy: 0.369712879856777
Mean precision: 0.367929663156589
Mean recall: 0.369712879856777
Mean F1-score: 0.36877726878833367


Make the predictions on the test set


In [None]:
y_pred = dtc.predict(X_test)

Evaluate the prediction


In [68]:
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.43      0.44      0.44     44425
           1       0.34      0.34      0.34     37819
           2       0.35      0.35      0.35     33792
           3       0.11      0.09      0.10      3673
           4       0.14      0.11      0.12       765

    accuracy                           0.37    120474
   macro avg       0.27      0.26      0.27    120474
weighted avg       0.37      0.37      0.37    120474

