# 2.1 Code Brief: Introduction to Decision Trees

Quick reference for decision tree concepts and visualizations.

## Setup

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

## Gini Impurity Visualization

In [None]:
# Plot the two-class Gini impurity, G(p) = 1 - p^2 - (1 - p)^2, against the
# proportion p of class N.
p = np.linspace(0, 1, 100)
gini = 1 - p**2 - (1-p)**2

fig = go.Figure()
fig.add_trace(go.Scatter(x=p, y=gini, mode='lines', line=dict(color='blue', width=3), name='Gini Impurity'))
# Highlight the two pure nodes (p = 0 and p = 1) and the maximum at p = 0.5.
# `mode` must include 'text' for `text`/`textposition` to be drawn on the plot;
# with plain 'markers' the labels are only visible on hover.
# cliponaxis=False keeps the labels at the plot edges from being cut off.
fig.add_trace(go.Scatter(x=[0, 0.5, 1], y=[0, 0.5, 0], mode='markers+text', marker=dict(size=12, color='red'),
                         text=['Pure (all E)', 'Max Impurity', 'Pure (all N)'], textposition='top center',
                         cliponaxis=False, name='Key Points'))
fig.update_layout(title='Gini Impurity vs. Class Proportion', xaxis_title='Proportion of Class N', yaxis_title='Gini Impurity', height=400)
fig.show()

## Gini vs Entropy Comparison

In [None]:
# Overlay two-class Gini impurity and entropy (log base 2) on one figure.
# The grid stops just short of 0 and 1 so log2 is never evaluated at 0.
p = np.linspace(0.001, 0.999, 100)
one_minus_p = 1 - p
gini = 1 - p**2 - one_minus_p**2
entropy = -p * np.log2(p) - one_minus_p * np.log2(one_minus_p)

# (y-values, line colour, legend label) for each curve, in drawing order.
curves = [
    (gini, 'blue', 'Gini Impurity'),
    (entropy, 'orange', 'Entropy'),
]

fig = go.Figure()
for curve_values, curve_color, curve_label in curves:
    fig.add_trace(
        go.Scatter(
            x=p,
            y=curve_values,
            mode='lines',
            line={'color': curve_color, 'width': 3},
            name=curve_label,
        )
    )
fig.update_layout(
    title='Gini Impurity vs. Entropy',
    xaxis_title='Proportion of Class N',
    yaxis_title='Impurity Measure',
    height=400,
)
fig.show()

## Evaluate Split Thresholds

In [None]:
# Small hand-made data set: two numeric features and an 'E'/'N' label column.
gpa_values = [3.5, 2.8, 1.9, 3.2, 2.1, 3.8, 2.5, 1.5]
dfw_rate_values = [0.0, 0.2, 0.5, 0.1, 0.4, 0.0, 0.3, 0.6]
enrolled_labels = ['E', 'E', 'N', 'E', 'N', 'E', 'E', 'N']
sample_data = pd.DataFrame({
    'GPA': gpa_values,
    'DFW_Rate': dfw_rate_values,
    'Enrolled': enrolled_labels,
})

# Gini impurity of the unsplit data: 1 - p_E^2 - p_N^2, where p_E is the
# fraction of rows labelled 'E' and p_N = 1 - p_E.
p_enrolled = sample_data['Enrolled'].eq('E').mean()
p_not_enrolled = 1 - p_enrolled
gini_original = 1 - p_enrolled**2 - p_not_enrolled**2

print(f"Original Gini Impurity: {gini_original:.3f}")
print(sample_data)

## Decision Tree vs Logistic Regression Boundaries

In [None]:
# Compare the class-1 probability surfaces of a depth-3 decision tree and a
# logistic regression fitted to the same synthetic two-feature, two-class data.
np.random.seed(42)
n_samples = 200
n_per_class = n_samples // 2
X1_class0 = np.random.multivariate_normal([3, 0.15], [[0.3, 0], [0, 0.01]], n_per_class)
X1_class1 = np.random.multivariate_normal([2, 0.35], [[0.5, 0], [0, 0.015]], n_per_class)
X_demo = np.vstack([X1_class0, X1_class1])
# Derive the labels from the per-class sample count so they stay aligned with
# the rows of X_demo when n_samples is changed.
y_demo = np.repeat([0, 1], n_per_class)
# Keep feature 0 within [0, 4] and feature 1 within [0, 1].
X_demo[:, 0] = np.clip(X_demo[:, 0], 0, 4)
X_demo[:, 1] = np.clip(X_demo[:, 1], 0, 1)

dt = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_demo, y_demo)
lr = LogisticRegression(random_state=42).fit(X_demo, y_demo)

# Evaluation grid; the axis vectors are computed once and reused for both the
# mesh and the contour traces.
grid_x = np.linspace(0, 4, 100)
grid_y = np.linspace(0, 0.7, 100)
xx, yy = np.meshgrid(grid_x, grid_y)
mesh_points = np.c_[xx.ravel(), yy.ravel()]
# Column 1 of predict_proba is the probability of class 1.
Z_dt = dt.predict_proba(mesh_points)[:, 1].reshape(xx.shape)
Z_lr = lr.predict_proba(mesh_points)[:, 1].reshape(xx.shape)

# z is P(class 1): shade low values blue and high values red so the background
# matches the marker colours (class 0 blue, class 1 red). Spelling the scale
# out avoids depending on the orientation of the named 'RdBu' scale.
class_colorscale = [[0, 'blue'], [0.5, 'white'], [1, 'red']]

fig = make_subplots(rows=1, cols=2, subplot_titles=('Decision Tree', 'Logistic Regression'))
for col, Z in enumerate([Z_dt, Z_lr], 1):
    fig.add_trace(go.Contour(x=grid_x, y=grid_y, z=Z, colorscale=class_colorscale, showscale=False, opacity=0.6), row=1, col=col)
    # Legend entries are shown for the first subplot only, to avoid duplicates.
    fig.add_trace(go.Scatter(x=X_demo[y_demo==0, 0], y=X_demo[y_demo==0, 1], mode='markers', marker=dict(color='blue', size=6), showlegend=(col==1), name='Enrolled'), row=1, col=col)
    fig.add_trace(go.Scatter(x=X_demo[y_demo==1, 0], y=X_demo[y_demo==1, 1], mode='markers', marker=dict(color='red', size=6), showlegend=(col==1), name='Not Enrolled'), row=1, col=col)
fig.update_layout(height=400, title_text='Decision Boundaries Comparison')
fig.show()

## Key Concepts

| Concept | Description |
|:--------|:------------|
| Gini Impurity | Probability of misclassifying a randomly chosen sample when it is labeled at random according to the node's class proportions; 0 = pure |
| Entropy | Information uncertainty; 0 = pure |
| max_depth | Controls tree complexity |
| Overfitting | Trees grown without constraints (e.g., no `max_depth`) can memorize the training data |