### Step 1: Define X and y

In [None]:
# Load the iris example dataset and pull out the feature matrix and label vector
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data    # features
y = iris.target  # labels

### Step 2: Split into Train and Test sets

In [None]:
# Hold out part of the data so the model can later be scored on samples
# it never saw during fitting; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Step 3: Fit full (unpruned) decision tree

In [None]:
# Grow the tree with no pruning at all; this is the baseline that the
# cost-complexity pruning path is computed from.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
tree  # display the fitted estimator as the cell output

### Step 4: Get cost-complexity pruning path

In [None]:
# Compute the cost-complexity pruning path on the training data and keep
# the candidate `ccp_alpha` values together with their matching impurities.
path = tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

### Step 5: Train a tree for each ccp_alpha

In [None]:
# Fit one tree per candidate `ccp_alpha` (same order as `ccp_alphas`) so the
# effect of the pruning strength on accuracy can be compared afterwards.
trees = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

### Step 6: Plot accuracy vs ccp_alpha

In [None]:
# Score every pruned tree on both splits and plot the two accuracy curves
# against `ccp_alpha` to see where the model overfits or underfits.
import matplotlib.pyplot as plt

train_scores, test_scores = [], []
for fitted_tree in trees:
    train_scores.append(fitted_tree.score(X_train, y_train))
    test_scores.append(fitted_tree.score(X_test, y_test))

fig, ax = plt.subplots()
ax.plot(ccp_alphas, train_scores, label='Train')
ax.plot(ccp_alphas, test_scores, label='Test')
ax.set_xlabel('ccp_alpha')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Pruning Level')
ax.legend()
ax.grid(True)
plt.show()

### Step 7: Select best ccp_alpha

In [None]:
# Pick the `ccp_alpha` value that yields the best validation performance.
#
# The choice is made with 5-fold cross-validation on the training split only.
# Selecting on the test-set scores would leak test information into the model
# choice and make the accuracy reported for the final tree optimistically biased.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Mean cross-validated accuracy for each candidate, in the same order as `ccp_alphas`.
cv_scores = [
    cross_val_score(
        DecisionTreeClassifier(random_state=0, ccp_alpha=alpha),
        X_train,
        y_train,
        cv=5,
    ).mean()
    for alpha in ccp_alphas
]

# np.argmax returns the first maximum, so ties resolve to the earliest entry
# of `ccp_alphas`.
best_index = int(np.argmax(cv_scores))
best_alpha = ccp_alphas[best_index]

### Step 8: Train final pruned tree

In [None]:
# Refit a single tree at the selected pruning level and report its accuracy
# on the held-out test split.
final_tree = DecisionTreeClassifier(random_state=0, ccp_alpha=best_alpha).fit(X_train, y_train)
final_accuracy = final_tree.score(X_test, y_test)
print("Final accuracy:", final_accuracy)

### Step 9 (Optional): Use pruned tree in ensemble

In [None]:
# In practice, RF trees are not pruned, but you can try this if needed.
from sklearn.ensemble import RandomForestClassifier

# Single-tree configuration at the selected pruning level. The forest does not
# consume this object (RandomForestClassifier builds its own trees); it is kept
# only as a reference for the equivalent stand-alone tree.
base_tree = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=0)

# Pass the pruning level to the forest itself so that every tree in the
# ensemble is cost-complexity pruned; without `ccp_alpha` here the forest
# would be trained completely unpruned.
rf = RandomForestClassifier(n_estimators=100, random_state=0, ccp_alpha=best_alpha)
rf.fit(X_train, y_train)

### Step 10: Tune n_estimators with CV

In [None]:
# Use 5-fold CV on the training split to compare forest sizes. The test split
# is not touched here, so it stays available for an unbiased final evaluation.
from sklearn.model_selection import cross_val_score

n_estimators_grid = [50, 100, 150, 200]
cv_mean_by_n = {}  # n_estimators -> mean cross-validated accuracy
for n in n_estimators_grid:
    # Use a dedicated name so the already-fitted `rf` is not overwritten
    # by an unfitted candidate.
    candidate_rf = RandomForestClassifier(n_estimators=n, random_state=0)
    scores = cross_val_score(candidate_rf, X_train, y_train, cv=5)
    cv_mean_by_n[n] = scores.mean()
    print(f"n={n} → Mean CV Accuracy: {scores.mean():.4f}")

# Keep the winner so later cells can use it instead of a hand-copied value.
# On ties `max` returns the first key in insertion order, i.e. the smallest forest.
best_n_estimators = max(cv_mean_by_n, key=cv_mean_by_n.get)
print(f"Best n_estimators by CV: {best_n_estimators}")

### Step 11: Final model evaluation on test set

In [None]:
# Fit the final forest on the training split and score it once on the
# held-out test split to estimate generalization performance.
# NOTE(review): n_estimators is fixed at 100 here rather than taken from the
# CV comparison in Step 10 — confirm this matches the CV winner.
best_rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
test_accuracy = best_rf.score(X_test, y_test)
print("Test accuracy:", test_accuracy)