In [184]:
import pandas as pd

import mglearn
import random
import numpy as np

def train_test_split(X, y, test_size=0.25, random_state=None):
    
    # Set random seed for reproducibility if random_state is provided
    if random_state is not None:
        np.random.seed(random_state)
        
        
    # Get the total number of samples
    n_samples = len(X)
    
    
    # Create an array of indices and shuffle them
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    # Determine the number of samples for the test set
    if isinstance(test_size, float):
        test_size = int(test_size * n_samples)
        
        
    # Extract indices for the test and training sets
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]
    
    
    # Use indices to split the data into training and testing sets
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

def fit_linear_regression(X, y):
    
    # Add a column of ones to X for the intercept term
    X_ext = np.column_stack((np.ones(len(X)), X))
    
    # Calculate coefficients using the normal equation
    coefficients = np.dot(np.dot(np.linalg.pinv(np.dot(X_ext.T, X_ext)), X_ext.T), y)
    
    return coefficients

def predict(X, coefficients):
    
    
    # Add a column of ones to X for the intercept term
    X_ext = np.column_stack((np.ones(len(X)), X))
    
    # Calculate predicted target values using dot product
    y_pred = np.dot(X_ext, coefficients)
    
    return y_pred
    
def mean_squared_error(y_true, y_pred):
    
    
    # Ensure that the input arrays have the same length
    if len(y_true) != len(y_pred):
        raise ValueError("Input arrays must have the same length.")

    # Calculate squared differences
    squared_diff = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]

    # Calculate mean squared error
    mse = sum(squared_diff) / len(y_true)

    return mse
# Load the extended Boston Housing dataset
X, y = mglearn.datasets.load_extended_boston()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




# Fit the linear regression model on the training set
coefficients = fit_linear_regression(X_train, y_train)

# Make predictions on the training set
y_train_pred = predict(X_train, coefficients)

# Make predictions on the test set
y_pred = predict(X_test, coefficients)


# Calculate Mean Squared Error (MSE) on the test set
mse_test = mean_squared_error(y_test, y_pred)


# Calculate Mean Squared Error (MSE) on the training set
mse_train = mean_squared_error(y_train, y_train_pred)
print("Mean Squared Error on Training Set:", mse_train)
print("Mean Squared Error on Test Set:", mse_test) 

Mean Squared Error on Training Set: 5.1199691799219735
Mean Squared Error on Test Set: 14.32943419266377



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

## Ridge Regression

Ridge regression, also known as Tikhonov regularization or L2 regularization, takes the least squares function and adds a regularization term to it. This is typically used to prevent overfitting.

The ridge regression function is given by:

$$
\text{J(θ)} = MSE(θ) + α\sum_{i=1}^{n} (θ_i)^2 
$$

Where:
- $\text{J(θ)}$ is the cost function to be minimized.
- $MSE(θ)$ is the Mean Squared Error term.
- $α$ is the regularization parameter.
- $θ_i$ are the regression coefficients.

The ridge regression coefficients are obtained by minimizing this cost function:

$$
\hat{θ} = \text{argmin}_θ J(θ)
$$

The closed-form solution for ridge regression is given by:

$$
\hat{θ} = (X^T X + αI)^{-1} X^T y
$$

Here:
- $\hat{θ}$ is the vector of ridge regression coefficients.
- $X$ is the matrix of input features.
- $y$ is the vector of target values.
- $I$ is the identity matrix.

In [185]:
import numpy as np

# closed-form approach
def RidgeRegression(X, Y, alpha):
    X_b = np.c_[np.ones((X.shape[0], 1)), X]
    I = np.identity(X_b.shape[1])
    I[0, 0] = 0
    
    theta = np.linalg.inv(X_b.T.dot(X_b) + alpha * I).dot(X_b.T).dot(Y)
    
    # Separate bias and weights
    w = theta[1:]
    b = theta[0]

    return w, b

Iterative methods like gradient descent are also used for ridge regression.

The cost function can also be written as:

$$
\text{J(θ)} = \frac{1}{2m}(\sum_{i=1}^{m} (h_θ(x^{(i)})-y^{(i)})^2 + α\sum_{j=1}^{n}(θ_j)^2)
$$

with the derivative with respect to $w_j$ (weights) when $θ_j$, $j \ge 1$:

$$
\frac{\partial J(θ)}{\partial w_j} = \frac{1}{m}(\sum_{i=1}^{m} (h_θ(x^{(i)})-y^{(i)})\cdot x_j^{(i)} + α\cdot w_j)
$$

with the derivate with respect to $b$ (bias) when $θ_0$:

$$
\frac{\partial J(θ)}{\partial b} = \frac{1}{m}\sum_{i=1}^{m} (h_θ(x^{(i)})-y^{(i)})
$$

Where:
- $m$ is the number of training examples.
- $h_θ(x^{(i)})$ is the predicted value of the $i$-th example.
- $y^{(i)}$ is the actual value of the $i$-th example.
- $θ_j$ are the regression coefficients.
- $n$ is the number of input features.
- $α$ is the regularization parameter.

In [174]:
# gradient descent approach
def gdRR(X, y, alpha, lr, n_iterations):
    m, n = X.shape
    W = np.zeros(n)
    b = 0
    
    for i in range(n_iterations):
        pred = np.dot(X, W) + b
        diff = pred - y
        dJdW = (1/m) * np.dot(X.T, diff) + alpha * W
        dJdb = (1/m) * np.sum(diff)
        
        W -= lr * dJdW
        b -= lr * dJdb
    
    return W, b

In [211]:
test_alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

results = []
w_set = []
b_set = []

for alpha in test_alphas:
  w, b = RidgeRegression(X_train, y_train, alpha)
  Y_pred = np.dot(X_test, w) + b

  results.append(mean_squared_error(y_test, Y_pred))
  w_set.append(w)
  b_set.append(b)

w_set = [w[0] for w in w_set]

results_gd = []

for alpha in test_alphas:
    w1, b1 = gdRR(X_train, y_train, alpha, lr = 0.01, n_iterations = 100)
    Y_pred = np.dot(X_test, w1) + b1
    results_gd.append(mean_squared_error(y_test, Y_pred))

print(results_gd)

data = {"Alpha": test_alphas, "w": w_set, "b": b_set, "MSE": results}
df = pd.DataFrame(data)
display(df)

[60.601440603152554, 60.67839785420658, 61.50384459946839, 72.37612588307135, 116.77088424301624, 124.37146593347452, 1.2713103417502652e+190]


Unnamed: 0,Alpha,w,b,MSE
0,0.001,5.288569,-25.050419,13.519711
1,0.01,-5.665918,-14.889592,12.257621
2,0.1,-3.472767,6.438892,11.013404
3,1.0,-1.493739,19.766685,12.710819
4,10.0,-0.856798,23.426068,20.17991
5,100.0,-0.293358,25.241065,33.025438
6,1000.0,-0.086045,24.77684,53.97018


From the above table, it can be seen that the lowest MSE for this dataset is when α = 0.1, which is our optimal regularization.

The ridge regression function can be found from sklearn.linear_model.

In [214]:
from sklearn.linear_model import Ridge

for alpha in test_alphas:
    ridge_model = Ridge(alpha=alpha)

    # Train the model
    ridge_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = ridge_model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

Mean Squared Error: 13.519710896809798
Mean Squared Error: 12.257621202213597
Mean Squared Error: 11.013404437259275
Mean Squared Error: 12.710819494074418
Mean Squared Error: 20.179909813929285
Mean Squared Error: 33.025437771544546
Mean Squared Error: 53.970179940917824


As you can see, the MSE of the different alpha levels match.