In [26]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        
    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
        
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
        
    def _gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        impurity = 1 - np.sum([(count / len(y)) ** 2 for count in counts])
        return impurity
        
    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        
        print("X: ",len(X))
        print("y: ",len(y))
        
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        
        return best_idx, best_thr
        
    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                node.num_samples = len(y)
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
            
        return node
        
    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        print(f"prediction: {node.predicted_class}")
        return node.predicted_class
    
    def traverse_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree_
        if node.is_leaf_node():
            print("  " * depth + f"Leaf Node: Majority class = {node.predicted_class}")
        else:
            print("  " * depth + f"Node: Majority class = {node.predicted_class}")
            if node.feature_index is not None:
                print("  " * depth + f"  Split on feature {node.feature_index} with threshold {node.threshold}")
                print("  " * depth + "  Left:")
                self.traverse_tree(node.left, depth + 1)
                print("  " * depth + "  Right:")
                self.traverse_tree(node.right, depth + 1)
        print("Number of samples in node: {}".format(node.num_samples))
    
class Node:
    def __init__(self, *, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0.0 
        self.left = None
        self.right = None
        self.num_samples = None

    def is_leaf_node(self):
        return self.left is None and self.right is None

In [27]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree
tree = DecisionTree(max_depth=2)
tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = tree.predict(X_test)

# Compute the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

X:  120
y:  120
X:  40
y:  40
X:  80
y:  80
prediction: 1
prediction: 0
prediction: 2
prediction: 1
prediction: 2
prediction: 0
prediction: 1
prediction: 2
prediction: 1
prediction: 1
prediction: 2
prediction: 0
prediction: 0
prediction: 0
prediction: 0
prediction: 1
prediction: 2
prediction: 1
prediction: 1
prediction: 2
prediction: 0
prediction: 2
prediction: 0
prediction: 2
prediction: 2
prediction: 2
prediction: 2
prediction: 2
prediction: 0
prediction: 0
Accuracy: 0.9666666666666667


In [28]:
np.unique(y_pred)

array([0, 1, 2])

In [29]:
tree.traverse_tree()

Node: Majority class = 1
  Split on feature 2 with threshold 2.45
  Left:
  Leaf Node: Majority class = 0
Number of samples in node: None
  Right:
  Node: Majority class = 1
    Split on feature 2 with threshold 4.75
    Left:
    Leaf Node: Majority class = 1
Number of samples in node: None
    Right:
    Leaf Node: Majority class = 2
Number of samples in node: None
Number of samples in node: 80
Number of samples in node: 120
