# 05 — Regression Tree with CCP Pruning
- Get CCP path → CV over ccp_alpha → select min-CV/1-SE; evaluate on test


In [None]:
import os, sys, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor

# Local utils
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))
from utils_data import load_concrete, train_test_split_fixed
from utils_plots import plot_pred_vs_actual, plot_residuals, report_mse

# Reuse the frozen train/test CSVs when both are on disk; otherwise build the
# split from the source data and write it out to those same two paths.
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data')
train_path = os.path.join(DATA_DIR, 'concrete_train.csv')
test_path = os.path.join(DATA_DIR, 'concrete_test.csv')

# Both files must exist; a lone CSV is treated the same as no split at all.
split_is_frozen = all(os.path.exists(csv) for csv in (train_path, test_path))

if not split_is_frozen:
    # load_concrete / train_test_split_fixed come from the local utils_data
    # module (not shown here); the fixed seed is passed for a repeatable split.
    df = load_concrete(local_first=True)
    train_df, test_df = train_test_split_fixed(df, seed=598)
    os.makedirs(DATA_DIR, exist_ok=True)
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
else:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

# Identify response column: take the first known spelling that appears in the
# training frame; when none matches, fall back to the frame's last column.
_response_aliases = (
    "concrete_compressive_strength_mpa",
    "concrete_compressive_strengt",
    "concrete_compressive_strength",
    "csmpa",
    "strength",
    "concrete_compressive_strength_(mpa)",
)
y_col = next((alias for alias in _response_aliases if alias in train_df.columns), None)
if y_col is None:
    y_col = train_df.columns[-1]

# Features stay as DataFrames (column names are used later for plotting);
# targets are pulled out via `.values`.
X_train, y_train = train_df.drop(columns=[y_col]), train_df[y_col].values
X_test, y_test = test_df.drop(columns=[y_col]), test_df[y_col].values

# greater_is_better=False makes the scorer report negated MSE, so it comes out
# negative when used inside GridSearchCV.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)


In [None]:
# Fit a tree with default settings on the training data and ask it for the
# cost-complexity pruning path, which supplies the candidate alphas.
tree0 = DecisionTreeRegressor(random_state=598)
tree0.fit(X_train, y_train)
path = tree0.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

# Build the alpha grid:
#  * drop the final (largest) alpha, as the original grid did -- per the
#    scikit-learn pruning example it is the trivial single-node tree;
#  * clip at zero, because floating-point round-off can make the smallest
#    alphas slightly negative and ccp_alpha must be non-negative;
#  * np.unique removes duplicate candidates (avoiding redundant fits) and
#    leaves the grid sorted in ascending order.
if len(ccp_alphas) > 1:
    alpha_candidates = np.unique(np.clip(ccp_alphas[:-1], 0.0, None)).tolist()
else:
    alpha_candidates = [0.0]
ccp_grid = {'ccp_alpha': alpha_candidates}

# 5-fold shuffled CV over the alpha grid; refit=True retrains the min-CV model
# on the full training set so `gcv` can predict directly.
cv = KFold(n_splits=5, shuffle=True, random_state=598)
tree = DecisionTreeRegressor(random_state=598)
gcv = GridSearchCV(tree, ccp_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, refit=True)
gcv.fit(X_train, y_train)
print("Best alpha:", gcv.best_params_)
print("CV MSE:", -gcv.best_score_)  # scores are negated MSE, so flip the sign

# One-standard-error rule (reported only -- the predictions below still come
# from the min-CV model): the largest alpha, i.e. the most heavily pruned
# tree, whose mean CV MSE is within one standard error of the minimum.
# std_test_score is the spread across folds; dividing by sqrt(#folds) gives an
# approximate standard error of the mean.
cv_mse = -gcv.cv_results_['mean_test_score']
cv_se = gcv.cv_results_['std_test_score'] / np.sqrt(cv.get_n_splits())
one_se_cutoff = cv_mse[gcv.best_index_] + cv_se[gcv.best_index_]
# cv_results_ follows the grid order, which is ascending in alpha, so the last
# qualifying index is the largest qualifying alpha.
idx_1se = np.flatnonzero(cv_mse <= one_se_cutoff)[-1]
print("1-SE alpha:", alpha_candidates[idx_1se])
print("1-SE CV MSE:", cv_mse[idx_1se])

# Evaluate the refit min-CV model on the held-out test set.
y_pred_test = gcv.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print("Test MSE:", test_mse)

# Diagnostic plots for the test-set predictions. The target directory is
# <parent of cwd>/reports/figures and is created if missing.
fig_dir = os.path.join(os.path.dirname(os.getcwd()), 'reports', 'figures')
os.makedirs(fig_dir, exist_ok=True)

# save_path is handed to the local utils_plots helpers (not shown here);
# presumably they write the figure to that location -- confirm in utils_plots.
pred_vs_actual_png = os.path.join(fig_dir, 'tree_pv.png')
residuals_png = os.path.join(fig_dir, 'tree_resid.png')

plot_pred_vs_actual(
    y_test,
    y_pred_test,
    title='Tree — Predicted vs Actual',
    save_path=pred_vs_actual_png,
)
plot_residuals(
    y_test,
    y_pred_test,
    title='Tree — Residuals vs Fitted',
    save_path=residuals_png,
)

# Quick tree visualization (max_depth=3 draws only the top levels).
plt.figure(figsize=(12, 6))

# Some scikit-learn releases validate plot_tree's feature_names as "list or
# None" and reject a pandas Index, so hand over a plain list of column names.
tree_feature_names = list(X_train.columns) if hasattr(X_train, 'columns') else None
try:
    plot_tree(gcv.best_estimator_, feature_names=tree_feature_names, filled=True, max_depth=3)
except Exception as err:
    # Best-effort fallback: still draw the tree, but say why the labelled
    # version failed instead of silently dropping the feature names.
    print(f"plot_tree with feature names failed ({err!r}); plotting without them")
    plot_tree(gcv.best_estimator_, filled=True, max_depth=3)
plt.title("Pruned Tree (top levels)")
plt.tight_layout()
plt.show()
