## Pseudo code: Decision Tree

```
function build_tree(data, labels, depth=0):
    if stop_condition(data, labels, depth):
        return LeafNode(class = majority_class(labels))
    
    best_gain = 0
    best_feat, best_thresh = None, None
    parent_impurity = impurity(labels)

    for each feature j in 1…D:
        for each threshold t in unique_values(data[:,j]):
            left_labels  = labels[data[:,j] ≤ t]
            right_labels = labels[data[:,j] >  t]

            if len(left_labels)==0 or len(right_labels)==0: continue

            gain = parent_impurity \
                   - (len(left_labels)/len(labels))*impurity(left_labels) \
                   - (len(right_labels)/len(labels))*impurity(right_labels)

            if gain > best_gain:
                best_gain, best_feat, best_thresh = gain, j, t

    if best_feat is None or best_gain < min_impurity_decrease:
        return LeafNode(class = majority_class(labels))

    left_data, left_labels  = split(data, labels, best_feat, best_thresh, side="left")
    right_data, right_labels = split(data, labels, best_feat, best_thresh, side="right")

    left_subtree  = build_tree(left_data,  left_labels,  depth+1)
    right_subtree = build_tree(right_data, right_labels, depth+1)

    return DecisionNode(
        feature_index = best_feat,
        threshold     = best_thresh,
        left          = left_subtree,
        right         = right_subtree
    )

```

In [6]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from mlcore.decision_tree import CustomDecisionTreeClassifier

In [7]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same shape as ``y_true``.

    Returns
    -------
    numpy.floating
        Number of matching positions divided by ``len(y_true)``. For empty
        input the division is 0/0 and the result is ``nan``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Coerce to arrays so `==` compares element-wise; on plain Python lists
    # it collapses to a single bool and the score would be silently wrong.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Refuse mismatched inputs instead of letting NumPy broadcast them.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    n_correct = np.sum(y_true == y_pred)
    return n_correct / len(y_true)
# Synthetic classification problem: 10,000 samples with 10 features,
# generated with a fixed seed so the data is identical on every run.
X, y = datasets.make_classification(
    n_samples=10000,
    n_features=10,
    random_state=4,
)

# Keep 20% of the samples aside for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [8]:
# Fit the from-scratch implementation on the training split.
tree = CustomDecisionTreeClassifier(
    max_depth=5,
    min_samples_split=2,
    criterion="gini",
)
tree.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = tree.predict(X_test)
print("Accuracy:", accuracy(y_test, y_pred))

Accuracy: 0.8875


In [9]:
# Reference model: scikit-learn's tree with the same hyper-parameters as the
# custom implementation, so the two accuracies are directly comparable.
# random_state is pinned because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be resolved
# differently from run to run.
sk_tree = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=2,
    criterion="gini",
    random_state=1234,
)
sk_tree.fit(X_train, y_train)
y_sk_pred = sk_tree.predict(X_test)

print("Sklearn Accuracy:", accuracy(y_test, y_sk_pred))

Sklearn Accuracy: 0.8875


In [10]:
# Dump the fitted custom tree as indented text: one line per split
# (feature index, threshold, gain) and one per leaf (predicted class, sample count).
CustomDecisionTreeClassifier.print_tree(tree)

Feature[5] ≤ 0.0018  |  Gain=0.2738
→ True branch:
    Feature[2] ≤ 0.5552  |  Gain=0.0284
    → True branch:
        Feature[0] ≤ 1.1331  |  Gain=0.0464
        → True branch:
            Feature[0] ≤ 0.6642  |  Gain=0.1211
            → True branch:
                Feature[0] ≤ -1.8437  |  Gain=0.0203
                → True branch:
                    Predict: 0 (samples=44)
                → False branch:
                    Predict: 1 (samples=615)
            → False branch:
                Feature[9] ≤ 1.7273  |  Gain=0.0178
                → True branch:
                    Predict: 0 (samples=255)
                → False branch:
                    Predict: 1 (samples=13)
        → False branch:
            Feature[7] ≤ -0.8383  |  Gain=0.0017
            → True branch:
                Feature[7] ≤ -0.8816  |  Gain=0.0534
                → True branch:
                    Predict: 1 (samples=40)
                → False branch:
                    Predict: 0 (samples=3)
        