Chapter 6.

2. a) answer(iii) -> Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

The LASSO adds an L1 penalty, which shrinks some coefficients to exactly zero, leading to sparser models.
This reduces model flexibility compared to least squares, because fewer variables are used.

b) answer(iii) -> Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.
Ridge regression applies an L2 penalty, shrinking coefficients towards zero but never exactly zero.
This also reduces model flexibility, increasing bias but reducing variance.

c) answer(i) -> More flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance.

Non-linear methods (e.g., splines, decision trees, neural networks) are generally more flexible than linear least squares because they can capture complex relationships between features and outcomes.
More flexibility tends to reduce bias (since the model can adapt to complex patterns) but often increases variance (susceptibility to overfitting).

In [None]:
%pip install ISLP

In [39]:
from ISLP import load_data
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

In [2]:
college = load_data('College')

In [3]:
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [13]:
college_encoded = pd.get_dummies(college, columns=['Private'], drop_first=True)

x = college_encoded.drop('Private_Yes', axis=1)
y = college_encoded['Private_Yes']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [14]:
model = LinearRegression()
model.fit(x_train, y_train)

In [15]:
sfs = sm.OLS(y_train, x_train).fit()
sfs.summary()

0,1,2,3
Dep. Variable:,Private_Yes,R-squared (uncentered):,-49.732
Model:,OLS,Adj. R-squared (uncentered):,-51.16
Method:,Least Squares,F-statistic:,-34.83
Date:,"Sat, 19 Jul 2025",Prob (F-statistic):,1.0
Time:,15:41:45,Log-Likelihood:,-103.43
No. Observations:,621,AIC:,240.9
Df Residuals:,604,BIC:,316.2
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Apps,-3.66e-05,1.08e-05,-3.375,0.001,-5.79e-05,-1.53e-05
Accept,2.762e-05,2.15e-05,1.287,0.199,-1.45e-05,6.98e-05
Enroll,4.695e-05,6.07e-05,0.773,0.440,-7.23e-05,0.000
Top10perc,-0.0008,0.002,-0.464,0.643,-0.004,0.003
Top25perc,0.0013,0.001,0.956,0.340,-0.001,0.004
F.Undergrad,-3.712e-05,1.08e-05,-3.422,0.001,-5.84e-05,-1.58e-05
P.Undergrad,-8.038e-06,1.24e-05,-0.651,0.515,-3.23e-05,1.62e-05
Outstate,4.475e-05,5.88e-06,7.612,0.000,3.32e-05,5.63e-05
Room.Board,6.164e-05,1.46e-05,4.213,0.000,3.29e-05,9.04e-05

0,1,2,3
Omnibus:,11.284,Durbin-Watson:,1.968
Prob(Omnibus):,0.004,Jarque-Bera (JB):,18.43
Skew:,-0.084,Prob(JB):,9.95e-05
Kurtosis:,3.827,Cond. No.,4650.0


In [17]:
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_cv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=10)

In [18]:
ridge_cv.fit(x_train, y_train)

In [19]:
y_pred = ridge_cv.predict(x_test)

In [20]:
test_mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {test_mse}')

Test MSE: 0.07088633588144291


In [21]:
print(f"Optimal λ (alpha): {ridge_cv.alpha_}")
print(f"Test MSE: {test_mse}")

Optimal λ (alpha): 814.8754173103217
Test MSE: 0.07088633588144291


In [24]:
lasso_cv = LassoCV(cv=10, random_state=42).fit(x_train, y_train)

In [26]:
y_pred = lasso_cv.predict(x_test)

In [27]:
lasso_mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {lasso_mse}')

Test MSE: 0.07711122258419881


In [29]:
nonzero_coefs = np.sum(lasso_cv.coef_ != 0)
print(f"Number of non-zero coefficients: {nonzero_coefs}")

Number of non-zero coefficients: 6


In [30]:
print(f"Optimal λ (alpha): {lasso_cv.alpha_}")


Optimal λ (alpha): 11.232972275158785


In [32]:
max_components = min(x_train.shape[0], x_train.shape[1])
pca = PCA(n_components=max_components)
pca.fit(x_train)

In [34]:
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

In [35]:
linear_model = LinearRegression()
linear_model.fit(x_train_pca, y_train)

In [36]:
y_pred = linear_model.predict(x_test_pca)

In [38]:
pcr_test_mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {pcr_test_mse}')

Test MSE: 0.0710962636939501


In [46]:
max_components = min(x_train.shape[0], x_train.shape[1])
pls = PLSRegression(n_components=max_components)
cv = KFold(n_splits=10, shuffle=True, random_state=42)
mse = -cross_val_score(pls, x_train, y_train, cv=cv, scoring='neg_mean_squared_error')

pls.fit(x_train, y_train)

In [47]:
x_train_pls = pls.transform(x_train)
x_test_pls = pls.transform(x_test)

In [48]:
best_m_pls = np.argmin(cross_val_score(LinearRegression(), x_train_pls, y_train, cv=10))

In [49]:
pls_best = PLSRegression(n_components=best_m_pls+1)
pls_best.fit(x_train, y_train)

In [50]:
y_pred = pls_best.predict(x_test)

In [51]:
pls_test_mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {pls_test_mse}')

Test MSE: 0.07235567075870665


In [55]:
Boston = load_data("Boston")

In [56]:
Boston.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [57]:
X = Boston.drop('crim', axis=1).values
y = Boston['crim'].values

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [59]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [60]:
results = {}

In [61]:
lasso_cv = LassoCV(cv=10, random_state=42).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_cv.predict(X_test_scaled)
lasso_test_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_nonzero = np.sum(lasso_cv.coef_ != 0)


In [62]:
results['LASSO'] = {'Test MSE': lasso_test_mse, 'Optimal Lambda': lasso_cv.alpha_, 'Non-zero Coefs': lasso_nonzero}


In [63]:
alphas = np.logspace(-4, 4, 100)
ridge_cv = RidgeCV(alphas=alphas, cv=10).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_cv.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, y_pred_ridge)

In [64]:
results['Ridge'] = {'Test MSE': ridge_test_mse, 'Optimal Lambda': ridge_cv.alpha_}


In [65]:
max_components = X_train_scaled.shape[1]
pcr_errors = []

for m in range(1, max_components + 1):
    pca = PCA(n_components=m)
    X_train_pca = pca.fit_transform(X_train_scaled)
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = -cross_val_score(LinearRegression(), X_train_pca, y_train, cv=cv, scoring='neg_mean_squared_error')
    pcr_errors.append(scores.mean())

In [66]:
best_m_pcr = np.argmin(pcr_errors) + 1
pca = PCA(n_components=best_m_pcr)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [67]:
lm_pcr = LinearRegression().fit(X_train_pca, y_train)
y_pred_pcr = lm_pcr.predict(X_test_pca)
pcr_test_mse = mean_squared_error(y_test, y_pred_pcr)

In [68]:
results['PCR'] = {'Test MSE': pcr_test_mse, 'Optimal Components': best_m_pcr}


In [69]:
pls_errors = []

for m in range(1, max_components + 1):
    pls = PLSRegression(n_components=m)
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    mse_scores = []

    for train_idx, val_idx in cv.split(X_train_scaled):
        pls.fit(X_train_scaled[train_idx], y_train[train_idx])
        y_val_pred = pls.predict(X_train_scaled[val_idx])
        mse_scores.append(mean_squared_error(y_train[val_idx], y_val_pred))

    pls_errors.append(np.mean(mse_scores))

In [70]:
best_m_pls = np.argmin(pls_errors) + 1
pls = PLSRegression(n_components=best_m_pls).fit(X_train_scaled, y_train)
y_pred_pls = pls.predict(X_test_scaled)
pls_test_mse = mean_squared_error(y_test, y_pred_pls)

In [71]:
results['PLS'] = {'Test MSE': pls_test_mse, 'Optimal Components': best_m_pls}


In [72]:
results

{'LASSO': {'Test MSE': 25.56356553171895,
  'Optimal Lambda': 0.0442860563159877,
  'Non-zero Coefs': 11},
 'Ridge': {'Test MSE': 25.95281941649804, 'Optimal Lambda': 79.24828983539186},
 'PCR': {'Test MSE': 25.46873410559455, 'Optimal Components': 12},
 'PLS': {'Test MSE': 25.482753375409022, 'Optimal Components': 9}}