1. Implement Linear Regression and calculate sum of residual error on the following
Datasets.
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
y = [1, 3, 2, 5, 7, 8, 8, 9, 10, 12]
 Compute the regression coefficients using analytic formulation and calculate Sum
Squared Error (SSE) and R 2 value.
 Implement gradient descent (both Full-batch and Stochastic with stopping
criteria) on Least Mean Square loss formulation to compute the coefficients of
regression matrix and compare the results using performance measures such as R 2
SSE etc.

In [5]:
import numpy as np

x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

def analytic_lr(x, y):
    n = len(x)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    
    num = np.sum((x - x_mean) * (y - y_mean))
    denom = np.sum((x - x_mean) ** 2)
    m = num / denom
    c = y_mean - m * x_mean
    
    y_pred = m * x + c
    sse = np.sum((y - y_pred) ** 2)
    
    ss_total = np.sum((y - y_mean) ** 2)
    r_sq = 1 - (sse / ss_total)
    
    return sse, r_sq

def full_gd_lr(x, y, lr, iterations):
    n = len(x)
    m = 0
    c = 0
    
    for _ in range(iterations):
        y_pred = m * x + c
        dm = (-2/n) * np.sum(x * (y - y_pred))
        dc = (-2/n) * np.sum(y - y_pred)
        
        m -= lr * dm
        c -= lr * dc
    
    y_pred_final = m * x + c
    sse = np.sum((y - y_pred_final) ** 2)
    
    y_mean = np.mean(y)
    ss_total = np.sum((y - y_mean) ** 2)
    r_sq = 1 - (sse / ss_total)
    
    return sse, r_sq

def stochastic_gd_lr(x, y, lr, iterations):
    n = len(x)
    m = 0
    c = 0
    
    for _ in range(iterations):
        for i in range(n):
            y_pred = m * x[i] + c
            dm = (-2) * x[i] * (y[i] - y_pred)
            dc = (-2) * (y[i] - y_pred)
            
            m -= lr * dm
            c -= lr * dc
    
    y_pred_final = m * x + c
    sse = np.sum((y - y_pred_final) ** 2)
    
    y_mean = np.mean(y)
    ss_total = np.sum((y - y_mean) ** 2)
    r_sq = 1 - (sse / ss_total)
    
    return sse, r_sq

lr = 0.01
iterations = 1000
lr_stochastic = 0.01
iterations_stochastic = 100

sse_analytic, r_sq_analytic = analytic_lr(x, y)
sse_gd_full, r_sq_gd_full = full_gd_lr(x, y, lr, iterations)
sse_gd_stochastic, r_sq_gd_stochastic = stochastic_gd_lr(x, y, lr_stochastic, iterations_stochastic)

print("Analytic solution:")
print("SSE:", sse_analytic)
print("R squared:", r_sq_analytic)

print("\nFull-batch Gradient Descent:")
print("SSE:", sse_gd_full)
print("R squared:", r_sq_gd_full)

print("\nStochastic Gradient Descent:")
print("SSE:", sse_gd_stochastic)
print("R squared:", r_sq_gd_stochastic)


Analytic solution:
SSE: 5.624242424242423
R squared: 0.952538038613988

Full-batch Gradient Descent:
SSE: 5.624278989977716
R squared: 0.9525377300423822

Stochastic Gradient Descent:
SSE: 7.575559791810393
R squared: 0.9360712253855663


2. Download Boston Housing Rate Dataset. Analyse the input attributes and find out the
attribute that best follow the linear relationship with the output price. Implement both the
analytic formulation and gradient descent (Full-batch, stochastic) on LMS loss
formulation to compute the coefficients of regression matrix and compare the results.

In [9]:
import numpy as np
import pandas as pd

df = pd.read_csv('HousingData.csv', skiprows=1, header=None, names=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'], na_values='NA')
df.dropna(inplace=True)

def lr_analytic(X, y):
    X = np.column_stack((np.ones(len(X)), X))
    coeffs = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
    return coeffs

def lr_gd(X, y, lr, num_iter):
    n_s, n_f = X.shape
    coeffs = np.zeros(n_f)
    
    for _ in range(num_iter):
        error = np.dot(X, coeffs) - y
        gradient = np.dot(X.T, error) / n_s
        coeffs -= lr * gradient
    
    return coeffs

def lr_sgd(X, y, lr, num_iter):
    n_s, n_f = X.shape
    coeffs = np.zeros(n_f)
    
    for _ in range(num_iter):
        for i in range(n_s):
            error = np.dot(X[i], coeffs) - y[i]
            gradient = X[i] * error
            coeffs -= lr * gradient
    
    return coeffs

corr = df.corr()['MEDV'].abs().sort_values(ascending=False)
best_attr = corr.index[1]

X_data = df[best_attr].values.reshape(-1, 1)
y_data = df['MEDV'].values

lr = 0.0001
num_iter = 1000

coeff_analytic = lr_analytic(X_data, y_data)
coeff_gd = lr_gd(X_data, y_data, lr, num_iter)
coeff_sgd = lr_sgd(X_data, y_data, lr, num_iter)

print("Best attribute:", best_attr)
print("Analytic coefficients:", coeff_analytic)
print("Gradient Descent (Full-batch) coefficients:", coeff_gd)
print("Stochastic Gradient Descent coefficients:", coeff_sgd)

Best attribute: LSTAT
Analytic coefficients: [34.23579926 -0.93006897]
Gradient Descent (Full-batch) coefficients: [1.09075655]
Stochastic Gradient Descent coefficients: [0.89061244]
