# 02 — Polynomial Regression (CV on train, fixed test set)
- Pipeline: StandardScaler → PolynomialFeatures → LinearRegression (or Ridge)
- Grid: degree ∈ {1,2,3}; optional Ridge α ∈ {0, 0.1, 1, 10}


In [None]:
import os, sys, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor

# Local utils
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))
from utils_data import load_concrete, train_test_split_fixed
from utils_plots import plot_pred_vs_actual, plot_residuals, report_mse

# Load frozen split if present, else create from source.
# Both CSVs must exist for the frozen split to be reused; if either one is
# missing, the split is rebuilt via train_test_split_fixed and both files are
# (re)written.
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data')
train_path, test_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('concrete_train.csv', 'concrete_test.csv')
)

have_frozen_split = os.path.exists(train_path) and os.path.exists(test_path)
if not have_frozen_split:
    df = load_concrete(local_first=True)
    train_df, test_df = train_test_split_fixed(df, seed=598)
    # Persist the split without the index column so a later pd.read_csv
    # does not pick up an extra unnamed column.
    os.makedirs(DATA_DIR, exist_ok=True)
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
else:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

# Identify response column: use the first candidate name that is present in
# the training frame; when none of them matches, fall back to the frame's
# last column.
y_candidates = (
    "concrete_compressive_strength_mpa",
    "concrete_compressive_strengt",
    "concrete_compressive_strength",
    "csmpa",
    "strength",
    "concrete_compressive_strength_(mpa)",
)
y_col = next(
    (name for name in y_candidates if name in train_df.columns),
    None,
)
if y_col is None:
    y_col = train_df.columns[-1]

def _split_features_target(frame, target):
    """Return (frame without the target column, target column as an array)."""
    return frame.drop(columns=[target]), frame[target].values


# Predictors stay as DataFrames; the response is pulled out via `.values`.
X_train, y_train = _split_features_target(train_df, y_col)
X_test, y_test = _split_features_target(test_df, y_col)

# greater_is_better=False makes the scorer return the negated MSE, so that
# "higher is better" holds when it is used for model selection.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # negative in GridSearchCV


In [None]:
# Build pipeline: choose the final estimator and the hyper-parameter grid.
# The polynomial degree is always searched; the Ridge penalty strength is
# only added to the grid when Ridge is selected.
use_ridge = False  # set True if instability appears

param_grid = {'poly__degree': [1, 2, 3]}
if not use_ridge:
    estimator = LinearRegression()
else:
    estimator = Ridge()
    # NOTE(review): the grid includes alpha = 0.0; scikit-learn's Ridge
    # documentation advises against alpha = 0 for numerical reasons
    # (LinearRegression covers that case) -- confirm this entry is intended.
    param_grid['est__alpha'] = [0.0, 0.1, 1, 10]

# Standardize the raw features, expand them into polynomial terms, then fit
# the chosen linear estimator. include_bias=False leaves the constant column
# out of the polynomial expansion.
pipeline_steps = [
    ('scale', StandardScaler(with_mean=True, with_std=True)),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('est', estimator),
]
pipe = Pipeline(pipeline_steps)

# 5-fold shuffled CV with a fixed seed, run on the training data only.
cv = KFold(n_splits=5, shuffle=True, random_state=598)
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    refit=True,
)
gcv.fit(X_train, y_train)

# The scoring is the negated MSE, so flip the sign back for reporting.
print('Best params:', gcv.best_params_)
print('CV MSE:', -gcv.best_score_)

# Evaluate the fitted grid search on the test split.
y_pred_test = gcv.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print('Test MSE:', test_mse)

# Diagnostic figures are written under <parent of cwd>/reports/figures.
fig_dir = os.path.join(os.path.dirname(os.getcwd()), 'reports', 'figures')
os.makedirs(fig_dir, exist_ok=True)
pv_path = os.path.join(fig_dir, 'poly_pv.png')
resid_path = os.path.join(fig_dir, 'poly_resid.png')
plot_pred_vs_actual(y_test, y_pred_test, title='Poly — Predicted vs Actual', save_path=pv_path)
plot_residuals(y_test, y_pred_test, title='Poly — Residuals vs Fitted', save_path=resid_path)
