# Sparse Polynomial Regression (degree ≤ 6)

This notebook fits a noisy dataset generated from a **sparse polynomial** (terms up to $x^6$).
We compare:
- Polynomial regression without regularisation
- Ridge regression (L2)
- Lasso regression (L1)

We split data into train/validation/test and tune hyperparameters using validation MSE.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Read the CSV that sits in the same folder as this notebook.
path = 'random_poly_21.csv'
df = pd.read_csv(path)

# Column 0 is the input x, kept as a 2-D column array; column 1 is the target y (1-D).
X = df.iloc[:, [0]].to_numpy()
y = df.iloc[:, 1].to_numpy()

# 60/20/20 split: hold out 40% first, then cut that hold-out in half
# to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print('Sizes:', *(len(part) for part in (X_train, X_val, X_test)))


In [None]:
# Degree of the polynomial feature expansion used by every pipeline below.
degree = 6

def make_pipe(model):
    """Wrap `model` in a polynomial-features -> standardise -> model pipeline.

    The expansion uses the notebook-level `degree` (read when the function is
    called) and omits the constant bias column. Returns an unfitted `Pipeline`
    whose steps are named 'poly', 'scaler' and 'model'.
    """
    expand = PolynomialFeatures(degree=degree, include_bias=False)
    standardise = StandardScaler()
    return Pipeline(steps=[('poly', expand), ('scaler', standardise), ('model', model)])

def validation_mse(pipe):
    """Fit `pipe` on the training split and return its MSE on the validation split.

    Reads the notebook-level X_train/y_train and X_val/y_val. `pipe` is fitted
    in place, so the caller's object is a fitted estimator afterwards.
    """
    pipe.fit(X_train, y_train)
    val_pred = pipe.predict(X_val)
    return mean_squared_error(y_val, val_pred)


In [None]:
# 1) Polynomial regression (no regularisation)
# Baseline: ordinary least squares on the scaled polynomial features.
lin_pipe = make_pipe(LinearRegression())
# validation_mse fits lin_pipe on the training split before scoring it on validation.
lin_val_mse = validation_mse(lin_pipe)
print('Poly (no reg) validation MSE:', lin_val_mse)


In [None]:
# 2) Ridge regression: choose alpha by validation MSE over a log-spaced grid
ridge_alphas = np.logspace(-6, 3, 40)

# One (alpha, validation MSE) row per candidate; each candidate gets a fresh pipeline.
ridge_scores = np.array(
    [
        (alpha, validation_mse(make_pipe(Ridge(alpha=alpha, random_state=42))))
        for alpha in ridge_alphas
    ],
    dtype=float,
)

# Column 0 holds the alphas, column 1 the matching validation MSEs.
best_ridge_row = np.argmin(ridge_scores[:, 1])
best_ridge_alpha = ridge_scores[best_ridge_row, 0]
print('Best Ridge alpha:', best_ridge_alpha)

fig, ax = plt.subplots()
ax.plot(ridge_scores[:, 0], ridge_scores[:, 1], marker='o', markersize=3)
ax.set_xscale('log')
ax.set_xlabel('Ridge alpha')
ax.set_ylabel('Validation MSE')
ax.set_title('Ridge: validation MSE vs alpha')
plt.show()


In [None]:
# 3) Lasso regression: choose alpha by validation MSE over a log-spaced grid
lasso_alphas = np.logspace(-6, 0, 40)

# One (alpha, validation MSE) row per candidate; each candidate gets a fresh pipeline.
lasso_scores = np.array(
    [
        (alpha, validation_mse(make_pipe(Lasso(alpha=alpha, max_iter=200000, random_state=42))))
        for alpha in lasso_alphas
    ],
    dtype=float,
)

# Column 0 holds the alphas, column 1 the matching validation MSEs.
best_lasso_row = np.argmin(lasso_scores[:, 1])
best_lasso_alpha = lasso_scores[best_lasso_row, 0]
print('Best Lasso alpha:', best_lasso_alpha)

fig, ax = plt.subplots()
ax.plot(lasso_scores[:, 0], lasso_scores[:, 1], marker='o', markersize=3)
ax.set_xscale('log')
ax.set_xlabel('Lasso alpha')
ax.set_ylabel('Validation MSE')
ax.set_title('Lasso: validation MSE vs alpha')
plt.show()


In [None]:
# Final models: merge the train and validation splits, then refit each model
# with its selected alpha. The test split stays untouched until evaluation.
X_trainval = np.concatenate((X_train, X_val), axis=0)
y_trainval = np.concatenate((y_train, y_val))

best_lin = make_pipe(LinearRegression())
best_ridge = make_pipe(Ridge(alpha=float(best_ridge_alpha), random_state=42))
best_lasso = make_pipe(
    Lasso(alpha=float(best_lasso_alpha), max_iter=200000, random_state=42)
)

# Pipeline.fit returns the pipeline itself, so fitting in place leaves
# best_lin / best_ridge / best_lasso bound to the fitted estimators.
for final_pipe in (best_lin, best_ridge, best_lasso):
    final_pipe.fit(X_trainval, y_trainval)

def report(name, model):
    """Print the test-set MSE and R^2 of a fitted `model`, labelled with `name`.

    Predictions are made on the notebook-level X_test and scored against y_test.
    Nothing is returned; the metrics are only printed.
    """
    y_hat = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_hat), r2_score(y_test, y_hat)
    print(f'{name}: Test MSE={mse:.6f}, Test R2={r2:.4f}')

# Report held-out test metrics for each refitted model, in a fixed order.
for label, fitted in (
    ('Poly (no reg)', best_lin),
    ('Ridge', best_ridge),
    ('Lasso', best_lasso),
):
    report(label, fitted)

# Overlay the three fitted curves on the data points.
# Dense grid over the observed x range, shaped (400, 1) for predict().
xx = np.linspace(X.min(), X.max(), 400)[:, np.newaxis]

fig, ax = plt.subplots()

# Data points, one scatter series per split.
for split_label, xs, ys in (
    ('Train', X_train, y_train),
    ('Val', X_val, y_val),
    ('Test', X_test, y_test),
):
    ax.scatter(xs, ys, label=split_label)

# Fitted curves evaluated on the dense grid.
for curve_label, fitted in (
    ('Poly (no reg)', best_lin),
    ('Ridge', best_ridge),
    ('Lasso', best_lasso),
):
    ax.plot(xx, fitted.predict(xx), label=curve_label)

ax.set(xlabel='x', ylabel='y', title='Degree-6 polynomial fits')
ax.legend()
plt.show()


In [None]:
# Estimate the polynomial coefficients from the most appropriate fitted model.
# Ridge is used here for stable coefficient estimates under correlated polynomial features.
chosen = best_ridge

# Pull the three fitted pipeline stages out by their step names.
poly, scaler, model = (chosen.named_steps[step] for step in ('poly', 'scaler', 'model'))

# Names of the expanded features, with the single input column called 'x'.
terms = poly.get_feature_names_out(['x'])

# The model was fitted on standardised features, so map its parameters back
# to the raw polynomial feature space:
#   y = w_scaled * ((phi - mean) / std) + b_scaled
#     = (w_scaled / std) * phi + (b_scaled - sum(w_scaled * mean / std))
w_scaled, b_scaled = model.coef_, model.intercept_

w = w_scaled / scaler.scale_
b = b_scaled - (w_scaled * scaler.mean_ / scaler.scale_).sum()

# One row per polynomial term, with the intercept appended as the last row.
coef_table = pd.DataFrame({'term': terms, 'coef': w})
coef_table.loc[len(coef_table)] = ['intercept', b]
print(coef_table)

# Bar chart of the estimated per-term coefficients (the intercept row is excluded).
# The title is derived from the estimator held in `chosen`, so the figure stays
# correctly labelled if a different fitted pipeline is selected.
chosen_model_name = type(chosen.named_steps['model']).__name__

plt.figure()
mask = coef_table.term != 'intercept'
plt.bar(coef_table.loc[mask, 'term'], coef_table.loc[mask, 'coef'])
plt.xticks(rotation=45, ha='right')  # slant the term labels so they do not overlap
plt.ylabel('Estimated coefficient')
plt.title(f'Estimated polynomial coefficients ({chosen_model_name})')
plt.tight_layout()
plt.show()
