In [279]:
import pandas as pd
# Regularization:
# a set of techniques used in ML to prevent overfitting, moving an overfit model toward a well-fitting one
# by reducing the complexity of the model


In [280]:
# We need to penalize the coefficients so they do not become very large (the penalty discourages overly complex models)
# Without regularization there is no constraint on the values of the coefficients of the higher-order terms

In [281]:
# Constrained optimization
# Add lambda times the sum of squares of the coefficients to the loss function
# The new loss function is larger than the plain RSS because of the added penalty term
# The model will try to decrease the total loss by reducing the contribution of each weight (the already-high ones become a little lower, while the smaller weights start tending toward zero)
# The model will now prefer weights that are small while still keeping the loss low (maintaining a balance)

In [282]:
# An overfitted model has a training loss tending to zero; adding the penalty to the loss function makes the optimizer accept a slightly higher training loss in exchange for smaller weights

In [283]:
# Types of Regularization

In [284]:
# Ridge (L-2 norm) -> Adding the constraint as the sum of squares of the weights

In [285]:
# Lasso (L-1 norm) -> Adding the constraint as the sum of absolute values of the weights
# Here the weights can become exactly zero, which is why Lasso can be used for feature selection

In [286]:
# =============================================================================
# L1 vs L2 REGULARIZATION COMPARISON TABLE
# =============================================================================

# +------------------------+-------------------------+-------------------------+
# | Aspect                 | L1 (Lasso)             | L2 (Ridge)             |
# +------------------------+-------------------------+-------------------------+
# | Penalty Function       | λ * Σ|wj|              | λ * Σwj²               |
# +------------------------+-------------------------+-------------------------+
# | Constraint Shape       | Diamond/Square          | Circle/Sphere          |
# +------------------------+-------------------------+-------------------------+
# | Sparsity              | Yes (exact zeros)       | No (shrinks toward 0)  |
# +------------------------+-------------------------+-------------------------+
# | Feature Selection     | Automatic               | Manual                 |
# +------------------------+-------------------------+-------------------------+
# | Solution Method       | Iterative              | Closed-form            |
# +------------------------+-------------------------+-------------------------+
# | Computational Cost    | Higher                 | Lower                  |
# +------------------------+-------------------------+-------------------------+
# | Handles Correlated    | Picks one arbitrarily  | Distributes weights    |
# | Features              |                        |                        |
# +------------------------+-------------------------+-------------------------+
# | Stability             | Less stable            | More stable            |
# +------------------------+-------------------------+-------------------------+
# | Best Use Case         | Feature selection      | Weight shrinkage       |
# |                       | High dimensions        | Multicollinearity      |
# +------------------------+-------------------------+-------------------------+

# =============================================================================
# GEOMETRIC SHAPES (CONCEPTUAL)
# =============================================================================

# L1 CONSTRAINT REGION (DIAMOND/SQUARE)
# ◊ Sharp corners and edges
# ◊ Flat sides meet at 90-degree angles  
# ◊ Corner points lie exactly on coordinate axes
# ◊ High probability of solution landing on corners (where weights = 0)

# L2 CONSTRAINT REGION (CIRCLE/SPHERE)
# ○ Smooth, curved boundary
# ○ No sharp corners or edges
# ○ Crosses the coordinate axes but has no corners there
# ○ Low probability of weights being exactly zero

# =============================================================================
# MATHEMATICAL FORMULATION
# =============================================================================

# LASSO (L1) LOSS FUNCTION:
# L_lasso(w) = (1/2n) * Σ(yi - xi^T w)² + λ * Σ|wj|
#              ↑_________________↑        ↑________↑
#                   RSS term           L1 penalty

# RIDGE (L2) LOSS FUNCTION:  
# L_ridge(w) = (1/2n) * Σ(yi - xi^T w)² + λ * Σwj²
#              ↑_________________↑        ↑_______↑
#                   RSS term           L2 penalty

# =============================================================================
# KEY BEHAVIORAL DIFFERENCES
# =============================================================================

# L1 (LASSO) BEHAVIOR:
# - Creates sparse solutions (many weights = 0)
# - Performs automatic feature selection
# - Can completely eliminate irrelevant features
# - Sharp penalty creates "all-or-nothing" effect

# L2 (RIDGE) BEHAVIOR:
# - Shrinks all weights proportionally
# - Keeps all features in the model
# - Weights approach but never reach exactly zero
# - Smooth penalty creates gradual shrinkage

# =============================================================================
# PRACTICAL IMPACT ON COEFFICIENTS
# =============================================================================

# ORIGINAL WEIGHTS (NO REGULARIZATION):
# weights = [5.2, 12.8, -0.3, 0.1, -15.6, 2.1, 0.05, -8.4]

# AFTER L1 (LASSO) REGULARIZATION:  
# weights = [4.1, 10.2, 0.0, 0.0, -12.3, 1.8, 0.0, -6.2]  
# ↑ Some weights become exactly 0 (feature selection)

# AFTER L2 (RIDGE) REGULARIZATION:
# weights = [3.8, 9.1, -0.2, 0.08, -11.2, 1.5, 0.03, -6.8]  
# ↑ All weights shrunk but remain non-zero

# =============================================================================
# WHEN TO CHOOSE WHICH
# =============================================================================

# CHOOSE L1 (LASSO) WHEN:
# - You have many features but expect only few are important
# - You want automatic feature selection
# - Interpretability is crucial (need sparse model)
# - High-dimensional data (more features than samples)
# - Examples: Gene selection, text classification, image processing

# CHOOSE L2 (RIDGE) WHEN:
# - All features might contribute somewhat to the prediction
# - You have multicollinearity problems
# - You want stable, robust predictions
# - Computational efficiency is important
# - Examples: Time series forecasting, financial modeling, sensor data analysis

# =============================================================================
# SOLUTION ALGORITHMS
# =============================================================================

# L1 (LASSO) SOLUTION:
# - No closed-form solution
# - Requires iterative methods:
#   * Coordinate descent
#   * Proximal gradient methods
#   * Subgradient methods
# - Soft thresholding: w_new = sign(w) * max(0, |w| - λ)

# L2 (RIDGE) SOLUTION:
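# - Has closed-form solution (for the unscaled objective ||y - Xw||² + λ||w||²):
#   w_ridge = (X^T X + λI)^(-1) X^T y
#   (with the (1/2n) scaling used in the loss above, λ is replaced by 2nλ)
# - Can be computed directly using matrix operations
# - Much faster computation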

In [287]:
# =============================================================================
# GEOMETRIC VISUALIZATION (ASCII ART)
# =============================================================================

# L1 CONSTRAINT (DIAMOND SHAPE) - 2D Case
# Constraint: |w1| + |w2| ≤ t
#
#              w2
#              |
#              ^  (0, t)
#             /|\
#            / | \
#           /  |  \
#   -------<---+--->------- w1
#  (-t,0)   \  |  /   (t,0)
#            \ | /
#             \|/
#              v  (0, -t)
#              |
# Sharp corners at axes → High probability of w=0

# L2 CONSTRAINT (CIRCLE SHAPE) - 2D Case  
# Constraint: w1² + w2² ≤ t
#
#              w2
#              |
#           .--+--.
#         .'   |   '.
#        /     |     \
#   ----|------+------|---- w1
#        \     |     /
#         '.   |   .'
#           '--+--'
#              |
# Smooth curve → Low probability of exactly hitting w=0

In [288]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing regression dataset.
housing = fetch_california_housing()

# Feature matrix / target vector, followed by the human-readable metadata.
X, Y = housing.data, housing.target
features, description = housing.feature_names, housing.DESCR

# Show the dataset's own description text.
print(description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [289]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=42,
)

In [290]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
# NOTE: X_train / X_test are overwritten with their standardized versions, so
# every later cell that reads them sees scaled features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [291]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline: plain linear regression with no penalty on the coefficients.
lr = LinearRegression().fit(X_train, Y_train)
y_pred = lr.predict(X_test)

# The test-set R^2 is the value displayed by this cell.
r2_score(y_true=Y_test, y_pred=y_pred)

0.5957681720649695

In [292]:
# Inspect the coefficients learned by the unregularized linear model
# (fitted on the standardized training features).
lr.coef_

array([ 8.49233996e-01,  1.22130962e-01, -2.99521134e-01,  3.48377554e-01,
       -8.85561779e-04, -4.16992130e-02, -8.93880954e-01, -8.68628225e-01])

In [293]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# `alpha` plays the role of lambda from the theory: larger alpha => stronger L2 penalty.
ridge = Ridge(alpha=0.1).fit(X_train, Y_train)
y_pred = ridge.predict(X_test)

# Test MSE on the first line, test R^2 on the second.
print(mean_squared_error(Y_test, y_pred), r2_score(Y_test, y_pred), sep="\n")

0.5304834843385456
0.5957700127249194


In [294]:
# Inspect the Ridge (alpha=0.1) coefficients. In the recorded output they are
# almost identical to the unregularized ones, i.e. this alpha shrinks very little.
ridge.coef_

array([ 8.49227048e-01,  1.22139889e-01, -2.99496212e-01,  3.48346828e-01,
       -8.82524162e-04, -4.16996542e-02, -8.93795621e-01, -8.68541682e-01])

In [295]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# `alpha` plays the role of lambda from the theory: larger alpha => stronger L1 penalty.
lasso = Lasso(alpha=0.5).fit(X_train, Y_train)
y_pred = lasso.predict(X_test)

# Test MSE on the first line, test R^2 on the second.
print(mean_squared_error(Y_test, y_pred), r2_score(Y_test, y_pred), sep="\n")

0.9355843191720238
0.2870819759728527


In [296]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# 1. Features must be scaled (Ridge is scale-sensitive!) -- and they already are.
#    X_train / X_test were overwritten with their standardized versions when the
#    original `scaler` was fitted. Fitting a second StandardScaler on them would
#    be essentially a no-op on the data, yet it would rebind `scaler` to an
#    object that no longer maps raw features to standardized ones. Reuse the
#    already-scaled arrays and leave `scaler` untouched.
X_train_scaled = X_train
X_test_scaled = X_test

# 2. Use RidgeCV to find optimal alpha (5-fold CV, model selection by R²)
alphas = np.logspace(-3, 3, 50)  # 50 log-spaced values from 0.001 to 1000
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='r2')
ridge_cv.fit(X_train_scaled, Y_train)

# 3. Results
print(f"✅ Optimal alpha: {ridge_cv.alpha_:.4f}")
print(f"   Best CV R²: {ridge_cv.best_score_:.4f}")

# 4. Final evaluation on the held-out test split
y_pred = ridge_cv.predict(X_test_scaled)
train_r2 = ridge_cv.score(X_train_scaled, Y_train)
test_r2 = r2_score(Y_test, y_pred)
test_mse = mean_squared_error(Y_test, y_pred)

print(f"\n📊 Final Performance:")
print(f"   Train R²: {train_r2:.4f}")
print(f"   Test R²:  {test_r2:.4f}")
print(f"   Test MSE: {test_mse:.4f}")
# Train-minus-test R²: a large positive gap would indicate overfitting.
print(f"   Overfitting gap: {train_r2 - test_r2:.4f}")

# 5. Check coefficients
print(f"\n🔍 Model Complexity:")
print(f"   Max coefficient: {np.max(np.abs(ridge_cv.coef_)):.4f}")
print(f"   Number of features: {len(ridge_cv.coef_)}")

✅ Optimal alpha: 6.2506
   Best CV R²: 0.6066

📊 Final Performance:
   Train R²: 0.6093
   Test R²:  0.5959
   Test MSE: 0.5303
   Overfitting gap: 0.0135

🔍 Model Complexity:
   Max coefficient: 0.8886
   Number of features: 8


In [297]:
# Inspect the Lasso (alpha=0.5) coefficients. In the recorded output only the
# first coefficient is non-zero; all others were driven to exactly zero.
lasso.coef_

array([ 0.29662158,  0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        ])