In [4]:
import numpy as np
import pandas as pd

# Decision Tree Classifier
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if n_classes == 1 or n_samples == 0 or (self.max_depth and depth >= self.max_depth):
            return {"type": "leaf", "value": self._most_common_label(y)}

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return {"type": "leaf", "value": self._most_common_label(y)}

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {
            "type": "node",
            "feature": best_feature,
            "threshold": best_threshold,
            "left": left_subtree,
            "right": right_subtree,
        }

    def _best_split(self, X, y):
        best_gini = float("inf")
        best_feature, best_threshold = None, None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gini = self._gini_impurity(X[:, feature], y, threshold)
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _gini_impurity(self, feature_values, y, threshold):
        left_mask = feature_values <= threshold
        right_mask = feature_values > threshold

        left_labels = y[left_mask]
        right_labels = y[right_mask]

        left_gini = 1.0 - sum((np.sum(left_labels == c) / len(left_labels)) ** 2 for c in np.unique(y)) if len(left_labels) > 0 else 0
        right_gini = 1.0 - sum((np.sum(right_labels == c) / len(right_labels)) ** 2 for c in np.unique(y)) if len(right_labels) > 0 else 0

        total_samples = len(y)
        gini = (len(left_labels) / total_samples) * left_gini + (len(right_labels) / total_samples) * right_gini

        return gini

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        if tree["type"] == "leaf":
            return tree["value"]
        if sample[tree["feature"]] <= tree["threshold"]:
            return self._predict_sample(sample, tree["left"])
        else:
            return self._predict_sample(sample, tree["right"])

# Driver Code
def main():
    train_file = "multi_classification_train.csv"
    test_file = "multi_classification_test.csv"

    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    # Drop the ID column
    X_train = df_train.iloc[:, 1:-1].values
    y_train = df_train.iloc[:, -1].values
    X_test = df_test.iloc[:, 1:].values

    model = DecisionTree(max_depth=3)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)  

    print("Predicted classes for training data:", y_pred_train)
    print("Predicted classes for test data:", y_pred_test)

    # Calculate precision, recall, and F1-score for training data
    def calculate_metrics(y_true, y_pred):
        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return precision, recall, f1_score

    precision, recall, f1_score = calculate_metrics(y_train, y_pred_train)
    print("Precision on training data: %0.2f" % precision)
    print("Recall on training data: %0.2f" % recall)
    print("F1-score on training data: %0.2f" % f1_score)

if __name__ == "__main__":
    main()


Predicted classes for training data: [4 2 4 ... 3 2 1]
Predicted classes for test data: [3 1 1 ... 1 1 3]
Precision on training data: 0.96
Recall on training data: 1.00
F1-score on training data: 0.98
