# Decision Tree

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Sanity check: reaching this line means the imports above succeeded.
print("Libraries imported!")

Libraries imported!


## Model Architecture

In [2]:
class DecisionTree():
    """Classification tree with binary splits chosen by Gini impurity.

    Each internal node tests ``x[feature_idx] < threshold``; samples for
    which the test holds go to the left child, all others to the right.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of split levels below the root. ``None`` places no
        limit: growth stops only when no split lowers the impurity.

    Attributes
    ----------
    n_classes_ : int
        Number of distinct labels seen by ``fit``.
    n_features_ : int
        Number of columns in the training matrix.
    tree_ : Node
        Root of the fitted tree.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as an array."""
        return np.array([self._predict(row) for row in np.asarray(X)])

    def _predict(self, X):
        """Return the predicted label for a single sample ``X`` (1-D)."""
        node = self.tree_
        # A node either has both children or none, so checking `left`
        # is enough to detect a leaf.
        while node.left is not None:
            if X[node.feature_idx] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.majority_class

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        classes, samples_per_class = np.unique(y, return_counts=True)
        # argmax yields a position in `classes`; map it back to the actual
        # label so subsets that lack some classes are labelled correctly.
        majority_class = classes[np.argmax(samples_per_class)]

        node = Node(majority_class=majority_class)
        if self.max_depth is None or depth < self.max_depth:
            feature_idx, threshold = self._best_split(X, y)
            if feature_idx is not None:
                left_indices = X[:, feature_idx] < threshold
                node.feature_idx = feature_idx
                node.threshold = threshold
                node.left = self._grow_tree(
                    X[left_indices], y[left_indices], depth + 1
                )
                node.right = self._grow_tree(
                    X[~left_indices], y[~left_indices], depth + 1
                )

        return node

    def _best_split(self, X, y):
        """Find the split that lowers the weighted Gini impurity the most.

        Returns
        -------
        (feature_idx, threshold)
            Column index and cut value of the best split, or
            ``(None, None)`` when no split improves on the parent impurity
            (fewer than two samples, a pure node, or all candidate features
            constant).
        """
        m = y.size

        # Fewer than two samples cannot be divided any further.
        if m <= 1:
            return None, None

        # A split is only accepted if it strictly beats the parent impurity.
        best_gini = self._gini(y)
        best_idx, best_thr = None, None

        # Impurity is never negative, so a pure node cannot be improved.
        if best_gini == 0:
            return best_idx, best_thr

        for feature_idx in range(self.n_features_):
            # Sort once per feature; labels are reordered alongside values.
            order = np.argsort(X[:, feature_idx], kind="stable")
            values = X[order, feature_idx]
            labels = y[order]

            for i in range(1, m):
                # Equal neighbours cannot be separated by a threshold.
                if values[i] == values[i - 1]:
                    continue

                # values[i - 1] < values[i], so exactly the first i sorted
                # samples fall strictly below the cut.
                gini_left = self._gini(labels[:i])
                gini_right = self._gini(labels[i:])
                weighted_gini = (i * gini_left + (m - i) * gini_right) / m

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_idx = feature_idx
                    # Cut halfway between the two neighbouring values.
                    best_thr = (values[i] + values[i - 1]) / 2

        return best_idx, best_thr

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        An empty label set is treated as pure and yields ``0.0``.
        """
        if len(y) == 0:
            return 0.0
        # Occurrences of each distinct label.
        _, counts = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(counts / len(y)))
    

class Node:
    """One node of the decision tree.

    A freshly created node carries only the majority label of its samples;
    the split rule (``feature_idx``, ``threshold``) and the two children
    start out as ``None`` and are filled in by whoever attaches a split.
    """

    def __init__(self, majority_class):
        # Label returned when prediction stops at this node.
        self.majority_class = majority_class
        # Split rule: unset until a split is attached.
        self.feature_idx = self.threshold = None
        # Child subtrees: unset until a split is attached.
        self.left = self.right = None


In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset and split it into train/test parts.
# random_state is pinned so the split is reproducible between runs.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=42,
)

In [4]:
# NOTE(review): `arr` is not used by the lines below; the counts are taken
# from y_train instead. Confirm whether `arr` is still needed elsewhere.
arr = np.array([1,1,1,1,2,2,3,3,3,4])
# np.unique with return_counts=True returns (unique values, occurrences);
# only the per-label occurrence counts are kept.
_, counts = np.unique(y_train, return_counts=True)
print(counts)

[35 39 38]


In [5]:
# Gini impurity of the training labels: 1 - sum over classes of p_k ** 2,
# where p_k = counts[k] / len(y_train).
impurity = 1 - np.sum(np.square(counts / len(y_train)))
impurity

0.6659757653061225

In [6]:
# Number of distinct labels present in the training targets.
n_classes = len(np.unique(y_train))
n_classes

3

In [7]:
# Count the training samples whose label equals i, for i = 0..n_classes-1.
# Labels outside that range would not be counted.
num_samples_per_class = [np.sum(y_train == i) for i in range(n_classes)]
num_samples_per_class

[35, 39, 38]

In [8]:
# Display the training feature matrix produced by train_test_split.
X_train

array([[5. , 3.6, 1.4, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [5.8, 2.7, 5.1, 1.9],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [5.4, 3.9, 1.3, 0.4],
       [5.4, 3.7, 1.5, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5],
       [6.4, 3.1, 5.5, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [7.7, 2.8, 6.7, 2. ],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.7, 4.9, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.6, 2.7, 4.2, 1.3],
       [5.1, 3.4, 1.5, 0.2],
       [5.7, 3. , 4.2, 1.2],
       [7.7, 3.8, 6.7, 2.2],
       [4.6, 3.2, 1.4, 0.2],
       [6.2, 2.9, 4.3, 1.3],
       [5.7, 2.5, 5. , 2. ],
       [5.5, 4.2, 1.4, 0.2],
       [6. , 3

In [9]:
# Pair each value of feature column 0 with its label, sort the pairs, and
# unzip them again: `thresholds` holds the sorted feature values and
# `classes` the labels in the same order.
thresholds, classes = zip(*sorted(zip(X_train[:, 0], y_train)))
thresholds

(4.3,
 4.4,
 4.4,
 4.5,
 4.6,
 4.6,
 4.6,
 4.7,
 4.8,
 4.9,
 4.9,
 4.9,
 4.9,
 4.9,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.2,
 5.2,
 5.2,
 5.3,
 5.4,
 5.4,
 5.4,
 5.4,
 5.4,
 5.5,
 5.5,
 5.5,
 5.5,
 5.5,
 5.5,
 5.6,
 5.6,
 5.6,
 5.6,
 5.7,
 5.7,
 5.7,
 5.7,
 5.7,
 5.8,
 5.8,
 5.8,
 5.8,
 5.8,
 5.8,
 5.9,
 5.9,
 5.9,
 6.0,
 6.0,
 6.0,
 6.0,
 6.0,
 6.1,
 6.1,
 6.1,
 6.1,
 6.2,
 6.2,
 6.2,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.4,
 6.4,
 6.4,
 6.4,
 6.5,
 6.5,
 6.5,
 6.6,
 6.6,
 6.7,
 6.7,
 6.7,
 6.7,
 6.7,
 6.8,
 6.9,
 6.9,
 6.9,
 7.0,
 7.1,
 7.2,
 7.2,
 7.2,
 7.3,
 7.4,
 7.6,
 7.7,
 7.7,
 7.7)

In [10]:
# Sort the (feature-0 value, label) pairs and unzip them: `h1` holds the
# sorted feature values and `h2` the labels in the same order.
h1, h2 = zip(*sorted(zip(X_train[:, 0], y_train)))
h1

(4.3,
 4.4,
 4.4,
 4.5,
 4.6,
 4.6,
 4.6,
 4.7,
 4.8,
 4.9,
 4.9,
 4.9,
 4.9,
 4.9,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.1,
 5.2,
 5.2,
 5.2,
 5.3,
 5.4,
 5.4,
 5.4,
 5.4,
 5.4,
 5.5,
 5.5,
 5.5,
 5.5,
 5.5,
 5.5,
 5.6,
 5.6,
 5.6,
 5.6,
 5.7,
 5.7,
 5.7,
 5.7,
 5.7,
 5.8,
 5.8,
 5.8,
 5.8,
 5.8,
 5.8,
 5.9,
 5.9,
 5.9,
 6.0,
 6.0,
 6.0,
 6.0,
 6.0,
 6.1,
 6.1,
 6.1,
 6.1,
 6.2,
 6.2,
 6.2,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.3,
 6.4,
 6.4,
 6.4,
 6.4,
 6.5,
 6.5,
 6.5,
 6.6,
 6.6,
 6.7,
 6.7,
 6.7,
 6.7,
 6.7,
 6.8,
 6.9,
 6.9,
 6.9,
 7.0,
 7.1,
 7.2,
 7.2,
 7.2,
 7.3,
 7.4,
 7.6,
 7.7,
 7.7,
 7.7)

In [11]:
# A list with one zero per class.
xa = [0] * n_classes
xa

[0, 0, 0]

In [12]:
# Per-class sample counts: how many training labels equal c, for each
# c in 0..n_classes-1.
num_parent = [np.sum(y_train == c) for c in range(n_classes)]
num_parent

[35, 39, 38]

In [13]:
# With return_counts=True, np.unique returns a tuple of
# (sorted unique labels, number of occurrences of each).
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([35, 39, 38]))