# Smoothing Splines Notebook
- LinearGAM with s(x) for each numeric predictor and .gridsearch for smoothing


In [3]:
# pip install pygam

In [23]:
import os, sys, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error

In [13]:
# local utils
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))
from utils_plots import plot_pred_vs_actual, plot_residuals, report_mse

# load train/test data
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data')
train_path = os.path.join(DATA_DIR, 'concrete_train.csv')
test_path  = os.path.join(DATA_DIR, 'concrete_test.csv')

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# split X/y
X_train = train_df.drop(columns=['concrete_compressive_strength']).to_numpy(dtype=float)
y_train = train_df['concrete_compressive_strength'].values
X_test  = test_df.drop(columns=['concrete_compressive_strength']).to_numpy(dtype=float)
y_test  = test_df['concrete_compressive_strength'].values

In [27]:
n_features = X_train.shape[1]
terms = s(0)
for i in range(1, n_features):
    terms += s(i)

gam = LinearGAM(terms).gridsearch(X_train, y_train)

InvalidIndexError: (slice(None, None, None), 0)

In [None]:
y_pred_test = gam.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)

In [None]:
fig_dir = os.path.join(os.path.dirname(os.getcwd()), 'reports', 'figures')
os.makedirs(fig_dir, exist_ok=True)
plot_pred_vs_actual(y_test, y_pred_test, title='GAM — Predicted vs Actual', save_path=os.path.join(fig_dir, 'gam_pv.png'))
plot_residuals(y_test, y_pred_test, title='GAM — Residuals vs Fitted', save_path=os.path.join(fig_dir, 'gam_resid.png'))

In [None]:
# Shape plots
for i in range(n_features):
    plt.figure()
    XX = gam.generate_X_grid(term=i)
    plt.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    plt.title(f'GAM shape for feature {i}')
    plt.tight_layout()
    plt.show()