# Intro to Regularization

In [2]:
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge

## Data

In [5]:
np.random.seed(42)

x1 = np.array([1,2,3,4,5])
x2 = np.array([2,4,6,8, 10])
x3 = np.array([0, -1, 2, -3, 4])
X = np.c_[x1, x2, x3]

y = x1 + x3 + np.random.normal(0, 1, 5)

In [6]:
y

array([1.49671415, 0.8617357 , 5.64768854, 2.52302986, 8.76584663])

In [7]:
model = LinearRegression()
model.fit(X,y)

lasso_model = Lasso(alpha=1)
lasso_model.fit(X,y)

ridge_model = Ridge(alpha=1)
ridge_model.fit(X,y)

print(f"OLS: {model.coef_}")
print(f"LASSO: {lasso_model.coef_}")
print(f"Ridge: {ridge_model.coef_}")

OLS: [0.22794338 0.45588677 0.80039832]
LASSO: [0.         0.48587976 0.66367724]
Ridge: [0.22641309 0.45282619 0.77541522]


The ridge coefficients are smaller than the OLS. What if we change alpha = 2?

In [8]:
model = LinearRegression()
model.fit(X,y)

lasso_model = Lasso(alpha=2)
lasso_model.fit(X,y)

ridge_model = Ridge(alpha=2)
ridge_model.fit(X,y)

print(f"OLS: {model.coef_}")
print(f"LASSO: {lasso_model.coef_}")
print(f"Ridge: {ridge_model.coef_}")

OLS: [0.22794338 0.45588677 0.80039832]
LASSO: [0.         0.40189421 0.52695897]
Ridge: [0.22474116 0.44948232 0.75216979]


We want to first scale our data. If our features has different, there will be values that will be penalized by definition of L1 and L2 Regularization

## Data Scaling

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

Standard scaler refers to $\frac{x-\mu}{\sigma}$ while MinMaxScaler refers to $\frac{\text{max}-x}{\text{max}-\text{min}}$

In [21]:
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train_scaled = scaler.fit_transform(X_train)

# print(X_train)
# print(X_train_scaled)

model = LinearRegression()
model.fit(X_train_scaled, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Do we need to scale y if we scaled X? No, it is not a necessary step because the weights will do it for us.

In [22]:
X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)
scaled_MSE = mean_squared_error(y_pred, y_test)



In [23]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
MSE = mean_squared_error(y_pred, y_test)

print(scaled_MSE, MSE)

1.1921168895644256 1.1921168895644274


In [26]:
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train_scaled = scaler.fit_transform(X_train)

# print(X_train)
# print(X_train_scaled)

model = Lasso(alpha=0.1)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)
scaled_MSE = mean_squared_error(y_pred, y_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
MSE = mean_squared_error(y_pred, y_test)

print(scaled_MSE, MSE)

1.4663791635557513 1.2869397009782033




When we increase the alpha for LASSO, why do the MSEs converge? Because if alpha approaches infinity, the weights approach 0.

In [27]:
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train_scaled = scaler.fit_transform(X_train)

# print(X_train)
# print(X_train_scaled)

model = Ridge(alpha=0.1)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)
scaled_MSE = mean_squared_error(y_pred, y_test)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
MSE = mean_squared_error(y_pred, y_test)

print(scaled_MSE, MSE)

1.274161569300757 1.2053927215896052




For Ridge, it is not the case that the MSEs will converge to one value since L2 does not penalize weights

## Hyperparameter Optimization for Regularization

Alpha can be treated as a hyperparameter so we wnat to do hyperparameter tuning.

In [28]:
from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(), Lasso(alpha = 0.1))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mean_squared_error(y_pred, y_test)

## Choosing Alpha 🐺

In [31]:
from sklearn.model_selection import cross_val_score

### OLS

In [34]:
model = LinearRegression()
score = -cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = 5).mean()

score

2.9910851704805217

### LASSO

In [37]:
alphas = [i * 0.1 for i in list(range(1,11))]

for a in alphas:
    model = make_pipeline(StandardScaler(), Lasso(alpha = a))
    score = -cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = 5).mean()
    print(score)

2.605287275459357
2.4708528291028933
2.5843813995864635
2.945872986910041
3.5553275910736195
4.412745212077227
5.518125849920848
6.871469504604482
8.47279714307533
9.681086986126681




### Ridge

In [38]:
alphas = [i * 0.1 for i in list(range(1,11))]

for a in alphas:
    model = make_pipeline(StandardScaler(), Ridge(alpha = a))
    score = -cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = 5).mean()
    print(score)

2.0512948718612125
1.5020647946757457
1.1639365570607634
0.950767291117357
0.81693562497872
0.7363362732241147
0.693060251042419
0.6768905351141278
0.6809685702051782
0.7005133940528082




### Grid Search

In [39]:
from sklearn.model_selection import GridSearchCV

In [42]:
lasso_model = make_pipeline(StandardScaler(), Lasso())
ridge_model = make_pipeline(StandardScaler(), Ridge())

lasso_parameters = {'lasso__alpha' : [0.01 * i for i in list(range(1,101))]}
lasso_reg = GridSearchCV(
    estimator = lasso_model,
    param_grid = lasso_parameters,
    scoring = "neg_mean_squared_error",
    cv = 5)

lasso_reg.fit(X, y)
print(lasso_reg.best_estimator_)



















Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=0.2, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])




alpha = 0.2

In [43]:
ridge_parameters = {'ridge__alpha' : [0.01 * i for i in list(range(1,101))]}
ridge_reg = GridSearchCV(
    estimator = ridge_model,
    param_grid = ridge_parameters,
    scoring = "neg_mean_squared_error",
    cv = 5)

ridge_reg.fit(X, y)
print(ridge_reg.best_estimator_)



















Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge', Ridge(alpha=0.8300000000000001, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001))])




alpha = 0.83

## Exercise: The Diabetes Dataset

Find the optimal alpha values for LASSO and Ridge Regression.

Which method (OLS, LASSO, Ridge) is best for this dataset?

What happens when you change Standard to MinMaxScaler?

In [49]:
from sklearn.datasets import load_diabetes
import pandas as pd

diabetes = load_diabetes()
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [50]:
X, y = diabetes.data, diabetes.target

# Lasso regression model
lasso_model = make_pipeline(StandardScaler(), Lasso())
lasso_parameters = {'lasso__alpha': [0.01 * i for i in list(range(1, 101))]}
lasso_reg = GridSearchCV(
    estimator=lasso_model,
    param_grid=lasso_parameters,
    scoring="neg_mean_squared_error",
    cv=5
)
lasso_reg.fit(X, y)
print("Best Lasso model:", lasso_reg.best_estimator_)
print("Best Lasso alpha:", lasso_reg.best_params_['lasso__alpha'])

# Ridge regression model
ridge_model = make_pipeline(StandardScaler(), Ridge())
ridge_parameters = {'ridge__alpha': [0.01 * i for i in list(range(1, 101))]}
ridge_reg = GridSearchCV(
    estimator=ridge_model,
    param_grid=ridge_parameters,
    scoring="neg_mean_squared_error",
    cv=5
)
ridge_reg.fit(X, y)
print("Best Ridge model:", ridge_reg.best_estimator_)
print("Best Ridge alpha:", ridge_reg.best_params_['ridge__alpha'])



Best Lasso model: Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=0.08, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])
Best Lasso alpha: 0.08
Best Ridge model: Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge', Ridge(alpha=0.2, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best Ridge alpha: 0.2




In [52]:
model = LinearRegression()

ols_score = -cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = 5).mean()
lasso_score = -lasso_reg.best_score_
ridge_score = -ridge_reg.best_score_

# Print the Mean Squared Errors for comparison
print(f"Mean Squared Error for Lasso: {lasso_score}")
print(f"Mean Squared Error for Ridge: {ridge_score}")
print(f"Mean Squared Error for OLS: {ols_score}")

Mean Squared Error for Lasso: 2991.474574663583
Mean Squared Error for Ridge: 2992.5925685897682
Mean Squared Error for OLS: 2993.072943299886


In [53]:
# Lasso regression model with optimal alpha
lasso_model = make_pipeline(StandardScaler(), Lasso(alpha=0.08))
lasso_model.fit(X, y)
y_pred_lasso = lasso_model.predict(X)
mse_lasso = mean_squared_error(y, y_pred_lasso)

# Ridge regression model with optimal alpha
ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=0.2))
ridge_model.fit(X, y)
y_pred_ridge = ridge_model.predict(X)
mse_ridge = mean_squared_error(y, y_pred_ridge)

# Ordinary Least Squares (OLS) regression model
ols_model = make_pipeline(StandardScaler(), LinearRegression())
ols_model.fit(X, y)
y_pred_ols = ols_model.predict(X)
mse_ols = mean_squared_error(y, y_pred_ols)

# Print the Mean Squared Errors for comparison
print(f"Mean Squared Error for Lasso: {mse_lasso}")
print(f"Mean Squared Error for Ridge: {mse_ridge}")
print(f"Mean Squared Error for OLS: {mse_ols}")

Mean Squared Error for Lasso: 2861.643416640848
Mean Squared Error for Ridge: 2859.7471306064785
Mean Squared Error for OLS: 2859.6903987680657


In [54]:
from sklearn.preprocessing import MinMaxScaler

# Lasso regression model
lasso_model = make_pipeline(MinMaxScaler(), Lasso())
lasso_parameters = {'lasso__alpha': [0.01 * i for i in list(range(1, 101))]}
lasso_reg = GridSearchCV(
    estimator=lasso_model,
    param_grid=lasso_parameters,
    scoring="neg_mean_squared_error",
    cv=5
)
lasso_reg.fit(X, y)
print("Best Lasso model:", lasso_reg.best_estimator_)
print("Best Lasso alpha:", lasso_reg.best_params_['lasso__alpha'])

# Ridge regression model
ridge_model = make_pipeline(MinMaxScaler(), Ridge())
ridge_parameters = {'ridge__alpha': [0.01 * i for i in list(range(1, 101))]}
ridge_reg = GridSearchCV(
    estimator=ridge_model,
    param_grid=ridge_parameters,
    scoring="neg_mean_squared_error",
    cv=5
)
ridge_reg.fit(X, y)
print("Best Ridge model:", ridge_reg.best_estimator_)
print("Best Ridge alpha:", ridge_reg.best_params_['ridge__alpha'])

model = LinearRegression()

ols_score = -cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv = 5).mean()
lasso_score = -lasso_reg.best_score_
ridge_score = -ridge_reg.best_score_

# Print the Mean Squared Errors for comparison
print(f"Mean Squared Error for Lasso: {lasso_score}")
print(f"Mean Squared Error for Ridge: {ridge_score}")
print(f"Mean Squared Error for OLS: {ols_score}")



Best Lasso model: Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('lasso', Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])
Best Lasso alpha: 0.01
Best Ridge model: Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('ridge', Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best Ridge alpha: 0.01
Mean Squared Error for Lasso: 2991.8002654486027
Mean Squared Error for Ridge: 2992.573483190658
Mean Squared Error for OLS: 2993.072943299886




What is the best? For this particular dataset, we should use Ridge more since we only have a few features.

LASSO would shrink the features and we might need all the features.

What's one example when we can better use LASSO? Predicting a family's income level.