In [1]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error

# Fetch the scikit-learn diabetes dataset and separate the feature matrix
# from the regression target.
data = load_diabetes()
X = data.data
y = data.target

# Reserve 20% of the samples for testing; the fixed random_state seeds the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    The same fit -> predict -> score sequence is needed for every model being
    compared, so it lives here once instead of being copy-pasted per model.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's reference is the fitted model afterwards.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Targets for the training and test splits.

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_r2, test_r2, test_mse)`` where the
        R² values come from ``r2_score`` and the MSE from
        ``mean_squared_error`` on the test split.
    """
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    return (
        train_pred,
        test_pred,
        r2_score(y_train, train_pred),
        r2_score(y_test, test_pred),
        mean_squared_error(y_test, test_pred),
    )


# Ordinary Least Squares (OLS) - No regularization
ols = LinearRegression()
(
    ols_train_pred,
    ols_test_pred,
    ols_train_r2,
    ols_test_r2,
    ols_test_mse,
) = _fit_and_score(ols, X_train, X_test, y_train, y_test)

# Ridge Regression (with alpha = 1.0)
# NOTE(review): per the scikit-learn documentation, load_diabetes() returns
# features that are mean-centered and scaled so each column's sum of squares
# is 1. On features that small, alpha=1.0 is a heavy penalty, which would
# explain the strong coefficient shrinkage and the lower R² than OLS in this
# cell's output. Consider tuning alpha (e.g. RidgeCV) before drawing
# conclusions about regularization — confirm against the dataset docs.
ridge = Ridge(alpha=1.0)
(
    ridge_train_pred,
    ridge_test_pred,
    ridge_train_r2,
    ridge_test_r2,
    ridge_test_mse,
) = _fit_and_score(ridge, X_train, X_test, y_train, y_test)

# Report the metrics for each model. Each section is emitted by a single
# print call whose arguments are joined with newlines.
print(
    "OLS (No Regularization):",
    f"Training R²: {ols_train_r2:.4f}",
    f"Test R²: {ols_test_r2:.4f}",
    f"Test MSE: {ols_test_mse:.4f}\n",  # trailing "\n" leaves a blank line
    sep="\n",
)

print(
    "Ridge Regression (alpha = 1.0):",
    f"Training R²: {ridge_train_r2:.4f}",
    f"Test R²: {ridge_test_r2:.4f}",
    f"Test MSE: {ridge_test_mse:.4f}",
    sep="\n",
)

# Show the first five fitted coefficients of each model side by side so the
# effect of the Ridge penalty on coefficient size is visible.
print(
    "\nSample Coefficients (First 5 Features):",
    f"OLS: {ols.coef_[:5]}",
    f"Ridge: {ridge.coef_[:5]}",
    sep="\n",
)

OLS (No Regularization):
Training R²: 0.5279
Test R²: 0.4526
Test MSE: 2900.1936

Ridge Regression (alpha = 1.0):
Training R²: 0.4424
Test R²: 0.4192
Test MSE: 3077.4159

Sample Coefficients (First 5 Features):
OLS: [  37.90402135 -241.96436231  542.42875852  347.70384391 -931.48884588]
Ridge: [ 45.36737726 -76.66608563 291.33883165 198.99581745  -0.53030959]
