# Decision Trees — Student Lab

We start using **sklearn** in Week 4, but you’ll still implement core pieces from scratch.

In [1]:
# Decision tree: an ML algorithm that repeatedly splits the data into branches on feature values and makes a prediction at each leaf.

import numpy as np

def check(name: str, cond: bool):
    """Report the outcome of a named sanity check.

    Prints ``OK: <name>`` when *cond* is truthy; otherwise raises
    ``AssertionError`` with the message ``Failed: <name>``.
    """
    if cond:
        print(f'OK: {name}')
        return
    raise AssertionError(f'Failed: {name}')

# Shared random generator with a fixed seed (0), so every run of the lab draws the same numbers.
rng = np.random.default_rng(0)

## Section 0 — Synthetic dataset
We’ll create a non-linear boundary dataset to show how trees fit.

In [2]:
# Create data that cannot be separated by a straight line.
def make_nonlinear(n=400, noise=0.05, radius=1.0, generator=None):
    """Sample a 2-D binary dataset whose classes are separated by a circle.

    Points are drawn uniformly from the square [-2, 2) x [-2, 2). A point is
    labelled 1 when it lies strictly inside the circle of the given radius
    centred at the origin, and 0 otherwise. Each label is then flipped
    independently with probability *noise*.

    Args:
        n: Number of samples to draw.
        noise: Probability of flipping each label (0.05 = about 5% noisy labels).
        radius: Radius of the circular class boundary.
        generator: NumPy ``Generator`` to draw from. When ``None`` the
            module-level ``rng`` is used, which matches the original behaviour.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n, 2)`` and ``y`` is an int array
        of shape ``(n,)`` containing 0/1 labels.
    """
    gen = rng if generator is None else generator
    X = gen.uniform(-2, 2, size=(n, 2))
    # Distance of every point from the origin.
    r = np.sqrt(X[:, 0]**2 + X[:, 1]**2)
    # Label 1 inside the circle, 0 outside.
    y = (r < radius).astype(int)
    # Label noise: flip a random subset of the labels.
    flip = gen.random(n) < noise
    y[flip] = 1 - y[flip]
    return X, y

# Build the dataset, shuffle the row indices, and cut them 70/30 into
# training and validation sets.
X, y = make_nonlinear()
n = len(X)
idx = rng.permutation(n)  # random ordering of the row indices
tr, va = idx[:int(0.7 * n)], idx[int(0.7 * n):]  # first 70% train, rest validation
Xtr, Xva = X[tr], X[va]
ytr, yva = y[tr], y[va]
check('shapes', len(Xtr) == len(ytr) and len(Xva) == len(yva))

OK: shapes


## Section 1 — Impurity

### Task 1.1: Gini impurity

# TODO: implement gini(y)
# HINT: p_k = count_k / n; gini = 1 - sum(p_k^2)


In [3]:
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    Computes ``1 - sum_k p_k**2`` where ``p_k`` is the fraction of samples
    carrying label ``k``. Works for any number of classes and any label
    values, not only 0/1.

    Args:
        y: Array-like of class labels.

    Returns:
        The impurity as a Python float; 0.0 for an empty or single-class input.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # An empty node has nothing to misclassify.
        return 0.0
    # Count each distinct label rather than relying on mean(), which is only
    # a class probability when the labels are exactly 0 and 1.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / labels.size
    return float(1.0 - np.sum(p ** 2))

# A single-class node must have impurity 0; a 50/50 binary node must have Gini 0.5.
check('gini_pure0', abs(gini(np.zeros(10, dtype=int))) < 1e-12)
check('gini_half', abs(gini(np.array([0,1]*5)) - 0.5) < 1e-12)

OK: gini_pure0
OK: gini_half


### Task 1.2: Entropy

# TODO: implement entropy(y)
# HINT: entropy = -sum p log2 p (use eps)


In [4]:
# Entropy: a measure of the uncertainty in a set of labels, i.e. how mixed they are.
# Low entropy means most labels are the same.
# High entropy means the labels are evenly mixed.
# The quality of a split is scored with either Gini impurity or entropy.
def entropy(y):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Computes ``-sum_k p_k * log2(p_k)`` where ``p_k`` is the fraction of
    samples carrying label ``k``. Works for any number of classes and any
    label values, not only 0/1.

    Args:
        y: Array-like of class labels.

    Returns:
        The entropy as a Python float; 0.0 for an empty or single-class input.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0
    # Only labels that actually occur are counted, so every p is > 0 and
    # log2(p) is always finite.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / labels.size
    # Subtract from 0.0 instead of negating so a pure node yields 0.0, not -0.0.
    return float(0.0 - np.sum(p * np.log2(p)))

# Ad-hoc look at the pure-node case with a looser tolerance; prints True or False.
print(abs(entropy(np.zeros(10, dtype=int))) < 1e-9)


# A single-class node must have zero entropy; a 50/50 binary node must have exactly 1 bit.
check('entropy_pure0', abs(entropy(np.zeros(10, dtype=int))) < 1e-12)
check('entropy_half', abs(entropy(np.array([0,1]*5)) - 1.0) < 1e-9)

True
OK: entropy_pure0
OK: entropy_half


## Section 2 — Best split (decision stump)

### Task 2.1: Evaluate impurity after threshold split

Split rule: go left if X[:,j] <= t else right.
Return weighted impurity and information gain.


In [5]:
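# Rule of thumb: Gini is cheaper to compute (no logarithm), so it is the usual default on large datasets; entropy is the alternative. In practice both tend to choose similar splits.
# Gini impurity: the probability that a randomly picked sample from the set is misclassified if it is labelled at random according to the set's label distribution.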

def split_indices(X, j, t):
    """Return the row indices on each side of the threshold split ``X[:, j] <= t``.

    Args:
        X: 2-D feature array.
        j: Index of the feature (column) to split on.
        t: Threshold; rows with a value ``<= t`` go left, rows ``> t`` go right.

    Returns:
        ``(left, right)``: two index arrays. The two comparisons are evaluated
        independently, so a row that satisfies neither lands in neither array.
    """
    column = X[:, j]
    at_or_below = column <= t
    above = column > t
    return np.where(at_or_below)[0], np.where(above)[0]

def info_gain(y, y_left, y_right, criterion='gini'):
    """Return the impurity decrease achieved by splitting a node in two.

    The gain is the parent's impurity minus the size-weighted impurities of
    the two children.

    Args:
        y: Labels of the parent node.
        y_left: Labels routed to the left child.
        y_right: Labels routed to the right child.
        criterion: ``'gini'`` selects Gini impurity; any other value selects
            entropy.

    Returns:
        The gain as a Python float. An empty parent has nothing to split and
        yields 0.0.
    """
    f = gini if criterion == 'gini' else entropy
    # Accept plain sequences as well as arrays; .size below needs an ndarray.
    y, y_left, y_right = (np.asarray(part) for part in (y, y_left, y_right))
    n = y.size  # number of samples in the parent node
    if n == 0:
        # Guard the division below: an empty parent cannot gain anything.
        return 0.0
    parent = f(y)  # impurity before the split
    w1 = (y_left.size / n) * f(y_left)  # weighted impurity of the left child
    w2 = (y_right.size / n) * f(y_right)  # weighted impurity of the right child
    return float(parent - (w1 + w2))

# Quick sanity check: a split that separates the two classes perfectly must have positive gain.
y0 = np.array([0,0,1,1])
gain = info_gain(y0, np.array([0,0]), np.array([1,1]), criterion='gini')
check('gain_positive', gain > 0)

OK: gain_positive


### Task 2.2: Find best (feature, threshold)

# TODO: implement best_split(X, y)
# HINT: thresholds from sorted unique feature values midpoints

**FAANG gotcha:** if a split makes an empty child, skip it.

In [6]:
# Exhaustive search for the single best (feature, threshold) split of the data.
def best_split(X, y, criterion='gini'):
    """Find the threshold split with the highest information gain.

    For each feature, candidate thresholds are the midpoints between
    consecutive sorted unique values. A row goes left when its value is
    ``<= threshold``. Candidates that leave either side empty are skipped.

    Args:
        X: 2-D feature array of shape (samples, features).
        y: Label array aligned with the rows of ``X``.
        criterion: Passed through to ``info_gain``.

    Returns:
        ``(best_j, best_t, best_gain)``. When no valid split exists the
        placeholder ``(-1, None, -1.0)`` is returned. On equal gains the
        earliest candidate found is kept.
    """
    best_feature, best_threshold, best_gain = -1, None, -1.0
    _, n_features = X.shape
    for feature in range(n_features):
        column = X[:, feature]
        levels = np.unique(column)  # sorted distinct values of this feature
        if levels.size < 2:
            # A constant feature offers no place to cut.
            continue
        midpoints = (levels[:-1] + levels[1:]) / 2
        for cut in midpoints:
            goes_left = column <= cut
            goes_right = ~goes_left
            if not (goes_left.any() and goes_right.any()):
                # One child would be empty; not a real split.
                continue
            candidate = info_gain(y, y[goes_left], y[goes_right], criterion=criterion)
            if candidate > best_gain:
                best_feature, best_threshold, best_gain = feature, float(cut), candidate
    return (best_feature, best_threshold, best_gain)

# Find the single best split on the training set and print it.
j, t, gain = best_split(Xtr, ytr)
print('best', j, t, gain)
# best_split starts from a placeholder gain of -1.0, so gain >= 0 means a real split replaced it.
check('gain_nonneg', gain >= 0)

best 0 1.026601307828603 0.03378155108197134
OK: gain_nonneg


### Task 2.3: Train a stump and evaluate

Use best_split to build a stump that predicts majority class on each side.


In [7]:
def _majority_label(labels):
    """Return the most frequent value in *labels*; the smallest value wins ties."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def stump_predict(X_train, y_train, X_test, criterion='gini'):
    """Fit a one-split decision stump on the training data and predict for X_test.

    The stump uses the split returned by ``best_split`` and predicts the
    majority training label on each side of the threshold.

    Args:
        X_train: 2-D training feature array.
        y_train: Training labels aligned with the rows of ``X_train``.
        X_test: 2-D feature array to predict for.
        criterion: Impurity criterion passed to ``best_split``.

    Returns:
        Array of predicted labels, one per row of ``X_test``, with the dtype
        of ``y_train``.
    """
    y_train = np.asarray(y_train)
    j, t, _ = best_split(X_train, y_train, criterion=criterion)
    if t is None:
        # best_split found no usable threshold (for example every feature is
        # constant), so predict the overall majority label instead of
        # comparing against None.
        return np.full(X_test.shape[0], _majority_label(y_train), dtype=y_train.dtype)
    left = X_train[:, j] <= t  # training rows that fall in the left child
    # Count-based majority works for any label values; rounding the mean is
    # only meaningful when the labels are exactly 0 and 1.
    left_label = _majority_label(y_train[left])
    right_label = _majority_label(y_train[~left])
    test_left = X_test[:, j] <= t  # test rows routed to the left child
    return np.where(test_left, left_label, right_label).astype(y_train.dtype)

# Accuracy: the fraction of predictions that match the true labels.
def accuracy(y, yhat):
    """Return the fraction of positions where *yhat* equals *y*.

    Args:
        y: Array-like of true labels.
        yhat: Array-like of predicted labels, with the same shape as ``y``.

    Returns:
        The accuracy as a Python float.

    Raises:
        ValueError: If ``y`` and ``yhat`` have different shapes.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    if y.shape != yhat.shape:
        # Refuse to broadcast, e.g. (n,) against (n, 1) would compare every pair.
        raise ValueError(f'shape mismatch: y {y.shape} vs yhat {yhat.shape}')
    return float(np.mean(y == yhat))

# Fit the stump on the training set and score it on both the training and validation rows.
yhat_tr, yhat_va = (stump_predict(Xtr, ytr, X_eval) for X_eval in (Xtr, Xva))
print(f'stump train acc {accuracy(ytr, yhat_tr)}')
print(f'stump val acc {accuracy(yva, yhat_va)}')

stump train acc 0.7678571428571429
stump val acc 0.75


## Section 3 — sklearn DecisionTreeClassifier (sanity check)

### Task 3.1: Train trees with different max_depth

# TODO: train sklearn tree and compare train/val accuracy for depth in [1,2,3,5,None].


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Sweep max_depth and compare train vs validation accuracy; None lets the tree grow without a depth limit.
depths = [1, 2, 3, 5, None]
for md in depths:
    # fit() returns the fitted estimator, so construction and training can be chained.
    clf = DecisionTreeClassifier(max_depth=md, random_state=0).fit(Xtr, ytr)
    tr_acc, va_acc = clf.score(Xtr, ytr), clf.score(Xva, yva)
    print('max_depth', md, 'train', tr_acc, 'val', va_acc)

max_depth 1 train 0.7678571428571429 val 0.75
max_depth 2 train 0.7678571428571429 val 0.75
max_depth 3 train 0.8428571428571429 val 0.7666666666666667
max_depth 5 train 0.9642857142857143 val 0.8416666666666667
max_depth None train 1.0 val 0.825


## Section 4 — Failure mode: leakage

### Task 4.1: Create a leaky feature
Add a feature that is directly derived from y and watch validation accuracy jump.

**Explain:** why do trees exploit leakage aggressively?

In [9]:

# Leakage demo: append the label itself as a third feature column.
# A tree picks the split with the highest gain, and a threshold on the leaked
# column separates the classes perfectly, so the tree reads the answer straight
# from that column.
Xtr_leak = np.hstack((Xtr, ytr[:, None]))
Xva_leak = np.hstack((Xva, yva[:, None]))

clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(Xtr_leak, ytr)
print('val acc with leakage', clf.score(Xva_leak, yva))

val acc with leakage 1.0


---
## Submission Checklist
- All TODOs completed
- Stump implemented
- sklearn depth sweep shown
- Leakage demo explained
