<a href="https://colab.research.google.com/github/trivedidharmik/cs3735/blob/main/Trivedi_Dharmik_As2Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest Class

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

class RandomForest:
    """Bagged ensemble of scikit-learn decision trees combined by majority vote.

    Every tree is trained on its own bootstrap sample (drawn with replacement,
    same size as the training set). Predictions are the most frequent label
    across the trees for each sample.

    Parameters
    ----------
    nTrees : int
        Number of trees to train; must be at least 1 when ``fit`` is called.
    maxDepth : int or None
        Forwarded to each tree as ``max_depth``.
    minSplit : int
        Forwarded to each tree as ``min_samples_split``.
    nFeatures : int or None
        Forwarded to each tree as ``max_features`` after being capped at the
        number of columns seen in ``fit``. A falsy value means "all columns".
    randomState : int or None
        Seed for the bootstrap draws and for each tree. ``None`` (default)
        keeps the legacy behaviour of drawing from NumPy's global random
        state, so ``np.random.seed`` still controls the result.
    """

    def __init__(self, nTrees=10, maxDepth=None, minSplit=2, nFeatures=None, randomState=None):
        self.nTrees = nTrees
        self.maxDepth = maxDepth
        self.minSplit = minSplit
        self.nFeatures = nFeatures
        self.randomState = randomState
        self.trees = []

    def _make_tree(self, maxFeatures, seed):
        """Build one unfitted decision tree from the forest's hyperparameters."""
        return DecisionTreeClassifier(
            max_depth=self.maxDepth,
            min_samples_split=self.minSplit,
            max_features=maxFeatures,
            random_state=seed,
        )

    def fit(self, X, y):
        """Train ``nTrees`` trees, each on a bootstrap sample of ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with one label per row of ``X``

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional, the data is empty, ``y`` does not
            have one entry per row of ``X``, or ``nTrees`` is less than 1.
        """
        self.trees = []

        # Coerce to ndarrays so the positional bootstrap indexing below is
        # valid for DataFrames, Series and plain lists as well.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        nSamples, nFeats = X.shape
        if nSamples == 0:
            raise ValueError("Cannot fit a RandomForest on an empty dataset")
        if y.shape[:1] != (nSamples,):
            raise ValueError(
                f"y must have one label per row of X: got y shape {y.shape} for {nSamples} rows"
            )
        if self.nTrees < 1:
            raise ValueError(f"nTrees must be at least 1, got {self.nTrees}")

        # Resolve the per-split feature budget locally: the nFeatures
        # hyperparameter itself must survive so a later refit on wider data
        # is not stuck with a cap derived from this dataset.
        maxFeatures = nFeats if not self.nFeatures else min(self.nFeatures, nFeats)

        # With no seed, fall back to the module-level np.random functions
        # (global state), exactly as before; otherwise use a private stream.
        seeded = self.randomState is not None
        rng = np.random.RandomState(self.randomState) if seeded else np.random
        maxSeed = np.iinfo(np.int32).max

        trees = []
        for _ in range(self.nTrees):
            # Give each tree its own seed so the trees differ from one
            # another while the whole forest stays reproducible.
            treeSeed = int(rng.randint(maxSeed)) if seeded else None
            tree = self._make_tree(maxFeatures, treeSeed)

            # Bootstrap sampling: nSamples row indices drawn with replacement.
            idxs = rng.choice(nSamples, nSamples, replace=True)
            tree.fit(X[idxs], y[idxs])
            trees.append(tree)

        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest holds no trained trees (``fit`` was not called).
        """
        if not self.trees:
            raise RuntimeError("RandomForest has not been fitted; call fit() before predict().")

        X = np.asarray(X)
        # Shape (nTrees, nSamples); transposing yields one row of votes per sample.
        treePreds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([self._most_common_label(preds) for preds in treePreds.T])

    def _most_common_label(self, y):
        """Return the most frequent value in ``y``."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

# Execution

Load the data

In [None]:
# Remote CSV for the letter-recognition dataset. It is read with header=None,
# so the column labels are supplied explicitly; "lettr" is the first column
# and is the one later used as the classification target.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data"
column_names = [
    "lettr",
    "x-box", "y-box", "width", "height",
    "onpix", "x-bar", "y-bar", "x2bar",
    "y2bar", "xybar", "x2ybr", "xy2br",
    "x-ege", "xegvy", "y-ege", "yegvx",
]
data = pd.read_csv(url, names=column_names, header=None)

Preprocess the data

In [None]:
# Target: the raw "lettr" column, integer-encoded. The fitted encoder `le` is
# kept so the original class names (le.classes_) remain available.
y = data["lettr"].values
le = LabelEncoder()
yEncoded = le.fit_transform(y)

# Features: every remaining column, as a NumPy array.
X = data.drop(columns="lettr").values

Split the data, train the forest, and predict on the test set

In [None]:
# One seed drives both the split and the forest so the reported metrics are
# reproducible under Restart & Run All.
SEED = 42

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, yEncoded, test_size=0.3, random_state=SEED)

# No seed is passed to RandomForest here, so its bootstrap draws (and the
# trees' feature sampling) come from NumPy's global random state. Seed that
# state immediately before fitting.
np.random.seed(SEED)
rf = RandomForest(nTrees=10, maxDepth=10, minSplit=2, nFeatures=5)
rf.fit(X_train, y_train)
yPred = rf.predict(X_test)

Evaluate the model

In [None]:
# Overall accuracy: fraction of test samples whose predicted label equals the
# true label.
accuracy = (yPred == y_test).mean()
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Per-class report, with rows labelled by the encoder's original class names.
report = classification_report(y_test, yPred, target_names=le.classes_)
print("Classification Report:\n", report)

Random Forest Accuracy: 0.8017
Classification Report:
               precision    recall  f1-score   support

           A       0.99      0.93      0.96       232
           B       0.49      0.70      0.58       229
           C       0.91      0.81      0.86       201
           D       0.73      0.79      0.76       250
           E       0.82      0.63      0.71       238
           F       0.86      0.77      0.81       211
           G       0.67      0.86      0.75       230
           H       0.83      0.48      0.61       218
           I       1.00      0.79      0.88       221
           J       0.99      0.76      0.86       228
           K       0.60      0.81      0.69       188
           L       0.99      0.85      0.91       231
           M       0.96      0.87      0.91       252
           N       0.89      0.82      0.86       231
           O       0.73      0.78      0.75       218
           P       0.86      0.86      0.86       248
           Q       0.66   