In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

boston = pd.read_csv("../../data/Boston.csv").rename({"Unnamed: 0": "Id"}, axis=1).set_index("Id")
boston.head()

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [8]:
X = boston.drop('crim', axis=1)
y = boston['crim']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

Train shape: (354, 12), Test shape: (152, 12)


### Best Subset Selection

In [9]:
from itertools import combinations
from sklearn.base import clone

def best_subset_selection(X, y, max_features=5):
    results = []
    for k in range(1, max_features+1):
        for combo in combinations(X.columns, k):
            X_subset = X[list(combo)]
            model = LinearRegression()
            scores = cross_val_score(model, X_subset, y, cv=5, scoring='neg_mean_squared_error')
            results.append({'features': combo, 'score': -scores.mean()})
    best = min(results, key=lambda x: x['score'])
    return best, results

# Limit to 5 features for computational reasons
best, all_results = best_subset_selection(X_train, y_train, max_features=5)
print('Best subset:', best['features'])
print('CV MSE:', best['score'])

Best subset: ('zn', 'nox', 'dis', 'rad', 'lstat')
CV MSE: 39.93106567184801


### Lasso Regression

In [10]:
# Standardize predictors for Lasso
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso = LassoCV(cv=5, random_state=1)
lasso.fit(X_train_scaled, y_train)

print('Best alpha:', lasso.alpha_)
print('Number of nonzero coefficients:', np.sum(lasso.coef_ != 0))

y_pred_lasso = lasso.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print('Test MSE (Lasso):', mse_lasso)

Best alpha: 0.023741910636735197
Number of nonzero coefficients: 11
Test MSE (Lasso): 50.23534559282834


### Ridge Regression

In [11]:
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5)
ridge.fit(X_train_scaled, y_train)

print('Best alpha:', ridge.alpha_)

y_pred_ridge = ridge.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print('Test MSE (Ridge):', mse_ridge)

Best alpha: 35.111917342151344
Test MSE (Ridge): 50.29278805173746


### Principal Components Regression (PCR)

In [12]:
# PCR: Find optimal number of components by cross-validation
mse_pcr = []
n_components = X_train.shape[1]
for k in range(1, n_components+1):
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    model = LinearRegression()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    mse = mean_squared_error(y_test, y_pred)
    mse_pcr.append(mse)

best_k = np.argmin(mse_pcr) + 1
print('Best number of components:', best_k)
print('Test MSE (PCR):', mse_pcr[best_k-1])

Best number of components: 12
Test MSE (PCR): 50.51047624630783


Among the methods tested to predict **crim** in the Boston dataset, best subset selection achieved the lowest cross-validated MSE (≈39.93) using only five predictors (**zn**, **nox**, **dis**, **rad**, **lstat**), indicating strong performance with a simple model. Lasso and ridge regression both produced similar test MSEs (≈50.24 and ≈50.29, respectively), with lasso selecting a sparse set of 11 coefficients. PCR, using 12 components, performed slightly worse (≈50.51). Overall, best subset selection outperformed the regularized and dimensionality reduction approaches in this setting, offering both accuracy and interpretability.