## HW2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Housing dataset: numeric text files read with np.loadtxt's default
# (whitespace) delimiter. Later cells treat the last column as the regression
# target and every other column as a feature.
train_house = np.loadtxt("housing_data/train.txt")
test_house = np.loadtxt("housing_data/test.txt")

# Spambase dataset: comma-separated numeric file. Later cells treat the last
# column as the class label and compare it against 0/1 predictions.
spambase_data = np.loadtxt("spambase/spambase.data", delimiter=",")

Q1 A - Housing dataset with Linear Reg (normal eqns)

In [None]:
# Q1 A: ordinary least squares on the housing data, solved in closed form.

# 1. Split each array into features (all but the last column) and target (last column).
X_train = train_house[:, :-1]
y_train = train_house[:, -1]
X_test = test_house[:, :-1]
y_test = test_house[:, -1]

# 2. Standardize the features. The scaler is fitted on the training split only
#    and then applied to both splits, so no test statistics enter the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

# 3. Prepend a column of ones so that theta_1[0] plays the role of the intercept.
ones_col_Xtrain = np.ones((X_train_normalized.shape[0], 1))
ones_col_Xtest = np.ones((X_test_normalized.shape[0], 1))

Xtrain_house_bias = np.hstack([ones_col_Xtrain, X_train_normalized])
Xtest_house_bias = np.hstack([ones_col_Xtest, X_test_normalized])

# 4. Normal equations: (X^T X) theta = X^T y.
#    The system is solved directly rather than by forming inv(X^T X) and
#    multiplying; solving avoids the extra rounding error and cost of an
#    explicit inverse while giving the same theta mathematically.
Xtrain_T = Xtrain_house_bias.T
Xtrain_dot = np.dot(Xtrain_T, Xtrain_house_bias)
Xtrain_y_dot = np.dot(Xtrain_T, y_train)
theta_1 = np.linalg.solve(Xtrain_dot, Xtrain_y_dot)

# 5. Predictions for both splits.
train_house_pred = np.dot(Xtrain_house_bias, theta_1)
test_house_pred = np.dot(Xtest_house_bias, theta_1)

# 6. Mean squared error of the closed-form fit.
train_mse_1 = np.mean((train_house_pred-y_train)**2)
test_mse_1 = np.mean((test_house_pred-y_test)**2)

print(f"TRAIN-MSE-1 Housing data with Linear Reg: {train_mse_1}")
print(f"TEST-MSE-1 Housing data with Linear Reg: {test_mse_1}")

Q1 B - Housing dataset with Linear Ridge Reg ->
$ \theta = (X^TX + \lambda I)^{-1} X^T Y $

In [None]:
# Q1 B: ridge regression on the housing data for a grid of lambda values.
# Reuses the standardized, bias-augmented matrices and targets from the Q1 A cell.
lambdas = np.arange(0, 2.1, 0.1)
test_mses = []
train_mses = []

# Everything here is independent of lambda, so it is computed once instead of
# on every loop iteration.
I = np.eye(Xtrain_house_bias.shape[1])
I[0,0] = 0  # column 0 is the bias column: leave the intercept unpenalized
Xtrain_T = Xtrain_house_bias.T
Xtrain_gram = np.dot(Xtrain_T, Xtrain_house_bias)
Xtrain_y_dot = np.dot(Xtrain_T, y_train)

for lambda_ridge in lambdas:
    # Ridge normal equations: (X^T X + lambda * I) theta = X^T y, solved as a
    # linear system rather than through an explicit matrix inverse.
    Xtrain_dot_ridge = Xtrain_gram + lambda_ridge*I
    theta_2 = np.linalg.solve(Xtrain_dot_ridge, Xtrain_y_dot)

    # Predictions for both splits.
    train_house_pred_ridge = np.dot(Xtrain_house_bias, theta_2)
    test_house_pred_ridge = np.dot(Xtest_house_bias, theta_2)

    # MSE for this lambda.
    train_mse_2 = np.mean((train_house_pred_ridge-y_train)**2)
    test_mse_2 = np.mean((test_house_pred_ridge-y_test)**2)

    train_mses.append(train_mse_2)
    test_mses.append(test_mse_2)

    print(f"Lambda: {lambda_ridge:.1f} | Train MSE-2: {train_mse_2:.4f} | Test MSE-2: {test_mse_2:.4f}")

# Pick the lambda with the lowest test MSE.
# NOTE(review): lambda is chosen on the test split itself, so the "best" test
# MSE reported below is an optimistic estimate; an unbiased one would need a
# separate validation split.
best_idx = np.argmin(test_mses)
best_lambda = lambdas[best_idx]
best_train_mse = train_mses[best_idx]
best_test_mse = test_mses[best_idx]

print("\n" + "="*60)
print(f"BEST Lambda: {best_lambda:.1f}")
print(f"Best Train MSE: {best_train_mse:.6f}")
print(f"Best Test MSE: {best_test_mse:.6f}")

Q1 C - Housing dataset with Linear Reg (Gradient Descent)

In [None]:
def gradient_descent_lin_reg(X, y, lr=0.1, epochs=1000, verbose=True):
    """Fit linear-regression weights with full-batch gradient descent.

    Starting from all-zero weights, every epoch computes ``X @ w``, steps along
    ``-(1/n) * X.T @ (X @ w - y)`` and records the MSE of the predictions made
    *before* that step.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Design matrix. No intercept is added here; include a column of ones in
        ``X`` if one is wanted.
    y : array-like, shape (n,) or (n, 1)
        Targets. The array is flattened so a column vector cannot make
        ``X @ w - y`` silently broadcast to an (n, n) matrix.
    lr : float
        Step size.
    epochs : int
        Number of full passes over the data.
    verbose : bool
        When True (the default) the MSE is printed every 100 epochs.

    Returns
    -------
    w : numpy.ndarray, shape (d,)
        Weights after the last update.
    loss_history : list
        One MSE value per epoch (length ``epochs``).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not hold one value per row.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
    n, d = X.shape
    if n == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n:
        raise ValueError(f"y has {y.shape[0]} values but X has {n} rows")

    w = np.zeros(d)
    loss_history = []

    for epoch in range(epochs):
        error = X @ w - y

        gradient = (1/n) * X.T @ error
        w = w - lr*gradient

        # The recorded loss belongs to the weights used for `error`,
        # i.e. the weights from before this epoch's update.
        loss = np.mean(error**2)
        loss_history.append(loss)

        if verbose and epoch%100==0:
            print(f"Epoch {epoch}, MSE: {loss:.4f}")

    return w, loss_history

In [None]:
print("Train Linear Regression with Gradient Descent:")
w_gd, losses = gradient_descent_lin_reg(
    Xtrain_house_bias, y_train, lr=0.1, epochs=1000
)

# Score both splits with the learned weights.
train_pred_gd = np.dot(Xtrain_house_bias, w_gd)
test_pred_gd = np.dot(Xtest_house_bias, w_gd)

# Mean squared error on each split.
train_mse_3 = np.square(train_pred_gd - y_train).mean()
test_mse_3 = np.square(test_pred_gd - y_test).mean()

print(f"\n Final Train MSE: {train_mse_3:.6f}")
print(f"\n Final Test MSE: {test_mse_3:.6f}")

# Loss per epoch over the whole run.
plt.plot(losses)
plt.gca().set(xlabel='Epoch', ylabel='MSE', title='Gradient Descent Convergence')
plt.show()

In [None]:
# Zoomed convergence curve: the first 10 epochs are dropped and the y-axis is
# clamped to a fixed window so the tail of the descent is readable.
plt.figure(figsize=(8, 5))
plt.plot(losses[10:])
plt.gca().set(
    xlabel='Epoch (starting from 10)',
    ylabel='MSE',
    title='GD Convergence (after initial drop)',
    ylim=(22.08, 22.5),
)
plt.grid(True, alpha=0.3)
plt.show()

Q1 D - Housing dataset with Logistic Reg (Gradient Descent): this will not work. The housing problem requires predicting continuous values (house prices such as 250k, 321k, etc.), whereas logistic regression outputs probabilities between 0 and 1 via the sigmoid function. It is designed to predict discrete classes (spam / not spam, pass / fail), so it is inappropriate for unbounded continuous targets like house prices.

Q1 E - Spambase dataset with Linear Regression (Normal Equations)

In [None]:
# Q1 E: linear regression on Spambase, solved in closed form.
# NOTE: this cell rebinds X_train / X_test / y_train / y_test (and the
# *_normalized / ones_col_* helpers) that the housing cells defined. After it
# has run, the housing cells that read y_train / y_test must not be re-run
# without first re-running the Q1 A cell.
X = spambase_data[:, :-1]
y = spambase_data[:, -1]

# 80/20 train-test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics of the training split only.
scalar_spam = StandardScaler()
scalar_spam.fit(X_train)
X_train_normalized = scalar_spam.transform(X_train)
X_test_normalized = scalar_spam.transform(X_test)

# Prepend a column of ones so the first weight acts as the intercept.
ones_col_Xtrain = np.ones((X_train_normalized.shape[0], 1))
ones_col_Xtest = np.ones((X_test_normalized.shape[0], 1))
spam_Xtrain_bias = np.hstack([ones_col_Xtrain, X_train_normalized])
spam_Xtest_bias = np.hstack([ones_col_Xtest, X_test_normalized])

# Normal equations: (X^T X) w = X^T y, solved as a linear system instead of
# forming inv(X^T X) explicitly (less rounding error, less work).
spamXtrain_T = spam_Xtrain_bias.T
spamXtrain_dot = spamXtrain_T @ spam_Xtrain_bias
spamX_y_dot = spamXtrain_T @ y_train
w_spam_linear = np.linalg.solve(spamXtrain_dot, spamX_y_dot)

# Continuous regression outputs; they are thresholded into classes in the next cell.
train_continuous_pred = spam_Xtrain_bias @ w_spam_linear
test_continuous_pred = spam_Xtest_bias @ w_spam_linear

In [None]:
# Turn the continuous regression outputs into 0/1 labels with a fixed cut-off.
threshold = 0.42
train_binary_pred = (train_continuous_pred > threshold).astype(int)
test_binary_pred = (test_continuous_pred > threshold).astype(int)

# Accuracy = fraction of predictions that match the labels.
train_acc_linear = (train_binary_pred == y_train).mean()
test_acc_linear = (test_binary_pred == y_test).mean()

print(
    "\nLinear Regression on Spambase:",
    f"Train Accuracy: {train_acc_linear:.4f}",
    f"Test Accuracy: {test_acc_linear:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix helper
def confusion_matrix(y_true, y_pred):
    """Count agreements and disagreements between 0/1 labels and 0/1 predictions.

    Returns a 2x2 array laid out as ``[[TN, FP], [FN, TP]]``. Only entries equal
    to 0 or 1 are counted; any other value falls into none of the four cells.
    """
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    counts = [
        [np.sum(actual_neg & pred_neg), np.sum(actual_neg & pred_pos)],
        [np.sum(actual_pos & pred_neg), np.sum(actual_pos & pred_pos)],
    ]
    return np.array(counts)

# Test-split confusion matrix for the thresholded closed-form model.
cm_linear = confusion_matrix(y_test, test_binary_pred)
print("\nConfusion Matrix (Linear Regression):")
print("[[TN={} FP={}]".format(cm_linear[0, 0], cm_linear[0, 1]))
print(" [FN={} TP={}]]".format(cm_linear[1, 0], cm_linear[1, 1]))

Q1 F - Spambase dataset with Linear Ridge Regression

In [None]:
# Q1 F - Spambase dataset with Linear Ridge Regression

# Sweep lambda with a fixed decision threshold. Reuses the standardized,
# bias-augmented spam matrices and the labels produced by the Q1 E cell.
threshold = 0.42
lambdas = np.arange(0, 10.1, 0.1)
train_accs_ridge = []
test_accs_ridge = []

print("Testing Ridge Regression with different lambda values:")
print("-" * 50)

# Lambda-independent pieces, computed once instead of on every iteration.
I = np.eye(spam_Xtrain_bias.shape[1])
I[0, 0] = 0  # column 0 is the bias column: leave the intercept unpenalized
spamXtrain_T = spam_Xtrain_bias.T
spamXtrain_gram = spamXtrain_T @ spam_Xtrain_bias
spamX_y_dot = spamXtrain_T @ y_train

for lambda_ridge in lambdas:
    # Ridge normal equations (X^T X + lambda * I) w = X^T y, solved as a linear
    # system rather than through an explicit inverse.
    spamXtrain_dot_ridge = spamXtrain_gram + lambda_ridge * I
    w_spam_ridge = np.linalg.solve(spamXtrain_dot_ridge, spamX_y_dot)

    train_conti_ridge = spam_Xtrain_bias @ w_spam_ridge
    test_conti_ridge = spam_Xtest_bias @ w_spam_ridge

    train_binary_ridge = (train_conti_ridge > threshold).astype(int)
    test_binary_ridge = (test_conti_ridge > threshold).astype(int)

    train_acc = np.mean(train_binary_ridge == y_train)
    test_acc = np.mean(test_binary_ridge == y_test)

    train_accs_ridge.append(train_acc)
    test_accs_ridge.append(test_acc)

    print(f"λ={lambda_ridge:.1f}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

# Pick the lambda with the highest test accuracy (first one on ties).
# NOTE(review): lambda is chosen on the test split itself, so the "best" test
# accuracy below is an optimistic estimate; an unbiased one would need a
# separate validation split.
best_idx = np.argmax(test_accs_ridge)
best_lambda = lambdas[best_idx]
best_train_acc = train_accs_ridge[best_idx]
best_test_acc = test_accs_ridge[best_idx]

print("\n" + "="*50)
print(f"BEST Lambda: {best_lambda:.1f}")
print(f"Best Train Accuracy: {best_train_acc:.4f}")
print(f"Best Test Accuracy: {best_test_acc:.4f}")

# Refit at the selected lambda so test_binary_ridge holds that model's test
# predictions (after the loop it would otherwise hold the last lambda's).
spamXtrain_dot_ridge = spamXtrain_gram + best_lambda * I
w_spam_ridge_best = np.linalg.solve(spamXtrain_dot_ridge, spamX_y_dot)

test_pred_ridge = np.dot(spam_Xtest_bias, w_spam_ridge_best)
test_binary_ridge = (test_pred_ridge > threshold).astype(int)

In [None]:
# Confusion matrix for the ridge model refitted at the selected lambda.
cm_ridge = confusion_matrix(y_test, test_binary_ridge)
print("\nConfusion Matrix (Ridge Regression, λ={}):".format(best_lambda))
print("[[TN={} FP={}]".format(cm_ridge[0, 0], cm_ridge[0, 1]))
print(" [FN={} TP={}]]".format(cm_ridge[1, 0], cm_ridge[1, 1]))

Q1 G - Spambase dataset with Linear Regression using Gradient Descent

In [None]:
def gradient_descent_linear_spam(X, y, lr=0.011, epochs=1000, verbose=True):
    """Full-batch gradient descent for least-squares linear regression.

    Weights start at zero. Each epoch evaluates the residual ``X @ w - y``,
    moves ``w`` by ``-lr * (1/n) * X.T @ residual`` and appends the MSE of that
    residual (computed with the pre-update weights) to the loss history.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Design matrix; a bias column must already be part of ``X`` if wanted.
    y : array-like, shape (n,) or (n, 1)
        Targets. Flattened on entry so that a column vector cannot broadcast
        the residual into an (n, n) matrix.
    lr : float
        Step size.
    epochs : int
        Number of gradient steps.
    verbose : bool
        When True (the default) the MSE is printed every 100 epochs.

    Returns
    -------
    w : numpy.ndarray, shape (d,)
        Final weights.
    loss_history : list
        MSE recorded at every epoch (length ``epochs``).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not hold one value per row.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
    n, d = X.shape
    if n == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n:
        raise ValueError(f"y has {y.shape[0]} values but X has {n} rows")

    w = np.zeros(d)
    loss_history = []

    for epoch in range(epochs):
        y_pred = X @ w

        # Gradient step.
        error = y_pred - y
        gradient = (1/n) * X.T @ error
        w = w - lr * gradient

        # MSE of the predictions made with the pre-update weights.
        loss = np.mean(error**2)
        loss_history.append(loss)

        if verbose and epoch % 100 == 0:
            print(f"Epoch {epoch}, MSE: {loss:.4f}")

    return w, loss_history

In [None]:
# Fit on the spam training split.
w_spam_gd, losses_spam = gradient_descent_linear_spam(
    spam_Xtrain_bias, y_train, lr=0.011, epochs=1000
)

# Continuous scores for both splits.
train_pred_gd_spam = np.dot(spam_Xtrain_bias, w_spam_gd)
test_pred_gd_spam = np.dot(spam_Xtest_bias, w_spam_gd)

# 0/1 labels using the threshold set in an earlier cell.
train_binary_gd = (train_pred_gd_spam > threshold).astype(int)
test_binary_gd = (test_pred_gd_spam > threshold).astype(int)

# Fraction of correct predictions on each split.
train_acc_gd = (train_binary_gd == y_train).mean()
test_acc_gd = (test_binary_gd == y_test).mean()

print(
    "\nLinear Regression (Gradient Descent) on Spambase:",
    f"Train Accuracy: {train_acc_gd:.4f}",
    f"Test Accuracy: {test_acc_gd:.4f}",
    sep="\n",
)

In [None]:
# Test-split confusion matrix for the gradient-descent linear model.
cm_gd = confusion_matrix(y_test, test_binary_gd)
print("\nConfusion Matrix (Linear Reg - Gradient Descent):")
print("[[TN={} FP={}]".format(cm_gd[0, 0], cm_gd[0, 1]))
print(" [FN={} TP={}]]".format(cm_gd[1, 0], cm_gd[1, 1]))

In [None]:
# Both convergence views are drawn in ONE cell. With Jupyter's default inline
# backend a figure is rendered when its cell finishes, so a second subplot
# added from a later cell would land on a new, otherwise empty figure instead
# of next to the first panel.
fig, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: loss over the whole run.
ax_full.plot(losses_spam)
ax_full.set(xlabel='Epoch', ylabel='MSE', title='GD Convergence - Spambase')
ax_full.grid(True, alpha=0.3)

# Right panel: same curve with the first 50 epochs dropped.
ax_zoom.plot(losses_spam[50:])
ax_zoom.set(xlabel='Epoch (from 50)', ylabel='MSE', title='GD Convergence - Zoomed')
ax_zoom.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Side-by-side test accuracy of the closed-form and gradient-descent fits.
print(
    "\nComparison:",
    f"Normal Equations: Test Acc = {test_acc_linear:.4f}",
    f"Gradient Descent: Test Acc = {test_acc_gd:.4f}",
    sep="\n",
)

In [None]:
# Q1G - Linear Regression with Gradient Descent (with learning rate optimization)

# Candidate step sizes 0.0001, 0.0002, ..., 0.0099. Zero is deliberately left
# out: with lr = 0 the weights never leave their all-zero start, so that run
# would spend 1000 epochs only to score an untrained model. Building the grid
# from integers also avoids the length ambiguity of np.arange with a float step.
learning_rates = np.arange(1, 100) * 0.0001
train_accs_gd = []
test_accs_gd = []

print("Testing different learning rates for Linear Regression (GD):")
print("-" * 50)

# The data shape is the same for every learning rate.
n, d = spam_Xtrain_bias.shape

for lr in learning_rates:
    # Fresh all-zero weights for each candidate.
    w = np.zeros(d)

    # 1000 full-batch gradient steps (silent, unlike gradient_descent_linear_spam).
    for epoch in range(1000):
        y_pred = spam_Xtrain_bias @ w
        gradient = (1/n) * spam_Xtrain_bias.T @ (y_pred - y_train)
        w = w - lr * gradient

    # Continuous scores for both splits.
    train_pred = spam_Xtrain_bias @ w
    test_pred = spam_Xtest_bias @ w

    # 0/1 labels using the threshold set in an earlier cell.
    train_binary = (train_pred > threshold).astype(int)
    test_binary = (test_pred > threshold).astype(int)

    # Accuracy on each split.
    train_acc = np.mean(train_binary == y_train)
    test_acc = np.mean(test_binary == y_test)

    train_accs_gd.append(train_acc)
    test_accs_gd.append(test_acc)

    print(f"LR={lr:.4f}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

# Pick the learning rate with the highest test accuracy (first one on ties).
# NOTE(review): selecting on the test split makes the reported best accuracy
# optimistic. These assignments also rebind best_idx / best_train_acc /
# best_test_acc from the ridge sweep cell.
best_idx = np.argmax(test_accs_gd)
best_lr = learning_rates[best_idx]
best_train_acc = train_accs_gd[best_idx]
best_test_acc = test_accs_gd[best_idx]

print("\n" + "="*50)
print(f"BEST Learning Rate: {best_lr:.4f}")
print(f"Best Train Accuracy: {best_train_acc:.4f}")
print(f"Best Test Accuracy: {best_test_acc:.4f}")

# Accuracy as a function of the learning rate.
plt.figure(figsize=(8, 5))
plt.plot(learning_rates, train_accs_gd, 'b-', label='Train Accuracy')
plt.plot(learning_rates, test_accs_gd, 'r-', label='Test Accuracy')
plt.xlabel('Learning Rate')
plt.ylabel('Accuracy')
plt.title('Learning Rate vs Accuracy')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Q1 H - Spambase dataset with Logistic Regression using Gradient descent

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for very large magnitudes.
    """
    z_clipped = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-z_clipped)
    return 1 / denominator

def gradient_descent_logistic(X, y, lr=0.01, epochs=1000, verbose=True):
    """Fit logistic-regression weights with full-batch gradient descent.

    Weights start at zero. Each epoch computes ``sigmoid(X @ w)``, steps along
    ``-(1/n) * X.T @ (sigmoid(X @ w) - y)`` and records the mean cross-entropy
    of the probabilities produced *before* that step.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Design matrix; a bias column must already be part of ``X`` if wanted.
    y : array-like, shape (n,) or (n, 1)
        Labels (presumably 0/1, as the cross-entropy formula assumes — confirm
        against the caller). Flattened on entry so a column vector cannot
        broadcast ``y_pred - y`` into an (n, n) matrix.
    lr : float
        Step size.
    epochs : int
        Number of gradient steps.
    verbose : bool
        When True (the default) the loss is printed every 100 epochs.

    Returns
    -------
    w : numpy.ndarray, shape (d,)
        Final weights.
    loss_history : list
        Cross-entropy recorded at every epoch (length ``epochs``).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not hold one value per row.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
    n, d = X.shape
    if n == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n:
        raise ValueError(f"y has {y.shape[0]} values but X has {n} rows")

    w = np.zeros(d)
    loss_history = []
    epsilon = 1e-7  # keeps log() away from log(0) when a probability saturates

    for epoch in range(epochs):
        z = X @ w
        y_pred = sigmoid(z)

        # Gradient step.
        gradient = (1/n) * X.T @ (y_pred - y)
        w = w - lr * gradient

        # Mean cross-entropy of the pre-update probabilities.
        loss = -np.mean(y*np.log(y_pred+epsilon)+(1-y)*np.log(1-y_pred+epsilon))
        loss_history.append(loss)

        if verbose and epoch%100==0:
            print(f"Epoch {epoch}, Cross-Entropy Loss: {loss:.4f}")
    return w, loss_history

In [None]:
# Fit logistic regression on the spam training split.
w_logistic, losses_logistic = gradient_descent_logistic(
    spam_Xtrain_bias, y_train, lr=0.5, epochs=1000
)

# Predicted probabilities for both splits.
train_pred_prob = sigmoid(np.dot(spam_Xtrain_bias, w_logistic))
test_pred_prob = sigmoid(np.dot(spam_Xtest_bias, w_logistic))

# Probabilities above 0.5 are labelled 1, the rest 0.
train_binary_logistic = (train_pred_prob > 0.5).astype(int)
test_binary_logistic = (test_pred_prob > 0.5).astype(int)

# Fraction of correct predictions on each split.
train_acc_logistic = (train_binary_logistic == y_train).mean()
test_acc_logistic = (test_binary_logistic == y_test).mean()

print(
    f"Train Accuracy: {train_acc_logistic:.4f}",
    f"Test Accuracy: {test_acc_logistic:.4f}",
    sep="\n",
)

In [None]:
# Test-split confusion matrix for the logistic model.
cm_logistic = confusion_matrix(y_test, test_binary_logistic)
print("\nConfusion Matrix (Logistic Regression):")
print("[[TN={} FP={}]".format(cm_logistic[0, 0], cm_logistic[0, 1]))
print(" [FN={} TP={}]]".format(cm_logistic[1, 0], cm_logistic[1, 1]))

In [None]:
# Cross-entropy per epoch for the logistic fit. Only one curve is shown, so a
# single axes is used; a 1x2 subplot grid would leave half the figure blank.
plt.figure(figsize=(10, 4))
plt.plot(losses_logistic)
plt.xlabel('Epoch')
plt.ylabel('Cross-Entropy Loss')
plt.title('Logistic Regression Convergence')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# The ridge row is derived from the ridge sweep's results instead of being
# typed in by hand: best_lambda is that sweep's pick and test_binary_ridge
# holds the test predictions of the model refitted at that lambda.
# best_test_acc is not used because the learning-rate sweep cell rebinds it.
test_acc_ridge = np.mean(test_binary_ridge == y_test)

print("\n" + "="*60)
print("FINAL COMPARISON ON SPAMBASE:")
print("-"*60)
print(f"Linear Regression (Normal Eq):  Test Acc = {test_acc_linear:.4f}")
print(f"Ridge Regression (λ={best_lambda:.1f}):        Test Acc = {test_acc_ridge:.4f}")
print(f"Linear Regression (GD):          Test Acc = {test_acc_gd:.4f}")
print(f"Logistic Regression (GD):        Test Acc = {test_acc_logistic:.4f}")

In [None]:
# ROC Curves and AUC
# roc_curve and auc are imported in the notebook's import cell with the other
# dependencies, so a fresh kernel shows every requirement up front.

# Linear regression: the raw continuous outputs serve as ranking scores.
fpr_linear, tpr_linear, _ = roc_curve(y_test, test_continuous_pred)
auc_linear = auc(fpr_linear, tpr_linear)

# Logistic regression: the predicted probabilities serve as ranking scores.
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, test_pred_prob)
auc_logistic = auc(fpr_logistic, tpr_logistic)

# Both ROC curves on one set of axes.
plt.figure(figsize=(8, 6))
plt.plot(fpr_linear, tpr_linear, label=f'Linear Regression (AUC = {auc_linear:.3f})')
plt.plot(fpr_logistic, tpr_logistic, label=f'Logistic Regression (AUC = {auc_logistic:.3f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves: Linear vs Logistic Regression')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"AUC - Linear Regression: {auc_linear:.4f}")
print(f"AUC - Logistic Regression: {auc_logistic:.4f}")