## HW2 - Logistic Regression, SVM, Kernels, Duality

## Housing Dataset (Linear Regression + Normal Equation)

In [None]:
# Housing data set: load the train and test splits from whitespace-delimited
# text files (np.loadtxt's default delimiter) into 2-D float arrays.
import numpy as np

train = np.loadtxt("housing_data/train.txt")
test = np.loadtxt("housing_data/test.txt")

# Show the raw training array as a quick sanity check of the load.
train

In [None]:
# (rows, columns) of the raw training array.
train.shape

In [None]:
# Column-wise split of the training array: everything except the last column
# is a feature, the last column is the target.
X, y = train[:, :-1], train[:, -1]

In [None]:
# Feature matrix dimensions (the label column has been sliced off).
X.shape

In [None]:
# Label vector length; it comes from the same rows as X.
y.shape

In [None]:
# Prepend a constant column of ones so the first weight acts as the intercept.
n_train_rows = X.shape[0]
ones_col = np.ones((n_train_rows, 1))
X_bias = np.concatenate((ones_col, X), axis=1)
X_bias.shape

In [None]:
# Display the design matrix to confirm the leading column of ones.
X_bias

Using normal equation to calculate best weights ($\theta$):

$\theta = (X^T X) ^ {-1} X^T Y$

In [None]:
# Fit ordinary least squares via the normal equations (X^T X) theta = X^T y.
# The linear system is solved directly with np.linalg.solve rather than by
# forming the explicit inverse of X^T X: the solution is the same, but the
# solve is cheaper and loses less precision when X^T X is poorly conditioned
# (the features here are not normalized).
X_t = X_bias.T

# Gram matrix X^T X and moment vector X^T y:
X_dot = X_t @ X_bias
Xy_dot = X_t @ y

# Raises numpy.linalg.LinAlgError if X^T X is exactly singular.
best_weights = np.linalg.solve(X_dot, Xy_dot)
best_weights

In [None]:
# Training-set evaluation: predictions from the fitted weights, then the
# mean of the squared residuals.
y_pred = X_bias.dot(best_weights)

train_residuals = y_pred - y
mse = np.mean(np.square(train_residuals))
mse

In [None]:
# Side-by-side look at the first few fitted values and targets, followed by
# the overall training error.
for label, value in (
    ("First 5 predictions:", y_pred[:5]),
    ("First 5 actual:", y[:5]),
    ("Training MSE:", mse),
):
    print(label, value)

In [None]:
# Test-set evaluation (test.txt): same column layout as the training file,
# features first and the target in the last column.
X_test, y_test = test[:, :-1], test[:, -1]

In [None]:
# Test feature matrix dimensions.
X_test.shape

In [None]:
# Test label vector length.
y_test.shape

In [None]:
# Give the test matrix the same leading intercept column as the training
# design matrix.
n_test_rows = test.shape[0]
test_ones = np.ones((n_test_rows, 1))
testX_bias = np.concatenate((test_ones, X_test), axis=1)

In [None]:
# Should show one more column than X_test (the added intercept column).
testX_bias.shape

In [None]:
# Apply the weights fitted on the training split to the test design matrix.
test_pred = testX_bias.dot(best_weights)
test_pred

In [None]:
# One prediction per test row.
test_pred.shape

In [None]:
# Test-set mean squared error.
test_residuals = test_pred - y_test
mse_test = np.mean(np.square(test_residuals))
mse_test

In [None]:
# Smallest test target (presumably inspected to put the MSE into scale).
np.min(y_test)

In [None]:
# Largest test target.
np.max(y_test)

In [None]:
# Spread of the test targets (population standard deviation, numpy's default ddof=0).
np.std(y_test)

### Question 1A - Normalized Housing Dataset

In [None]:
# Load the housing splits again under section-specific names.
house_train = np.loadtxt("housing_data/train.txt")
house_test = np.loadtxt("housing_data/test.txt")

# (rows, columns) of the training split.
house_train.shape

In [None]:
# Features are all columns but the last; the last column is the target.
# The same layout applies to both splits.
X_train, y_train = house_train[:, :-1], house_train[:, -1]
X_test, y_test = house_test[:, :-1], house_test[:, -1]

In [None]:
# Z-score normalization. The mean and standard deviation are computed from
# the training features only and reused for the test features, so no test-set
# statistics enter the model.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

# A column whose standard deviation is exactly zero would be divided by zero
# and turn into NaN/inf. Scale such columns by 1 instead; all other columns
# are unaffected.
train_std = np.where(train_std == 0, 1.0, train_std)

X_train_normalized = (X_train - train_mean) / train_std
X_test_normalized = (X_test - train_mean) / train_std

X_train_normalized[:3]

In [None]:
# Intercept columns for the normalized train and test matrices.
ones_col_Xtrain = np.ones((len(X_train_normalized), 1))
ones_col_Xtest = np.ones((len(X_test_normalized), 1))

# Design matrices: [1 | normalized features].
houseXtrain_bias = np.concatenate((ones_col_Xtrain, X_train_normalized), axis=1)
houseXtest_bias = np.concatenate((ones_col_Xtest, X_test_normalized), axis=1)

In [None]:
# Ordinary least squares on the normalized housing features via the normal
# equations (X^T X) theta = X^T y. The system is solved with np.linalg.solve
# instead of multiplying by an explicit inverse: same solution, less work and
# less rounding error.
houseXtrain_transpose = houseXtrain_bias.T

# Gram matrix X^T X and moment vector X^T y:
houseXtrain_dot = houseXtrain_transpose @ houseXtrain_bias
houseXtrain_y_dot = houseXtrain_transpose @ y_train

# Raises numpy.linalg.LinAlgError if X^T X is exactly singular.
house_train_theta = np.linalg.solve(houseXtrain_dot, houseXtrain_y_dot)
house_train_theta

In [None]:
# Predictions on both splits from the single set of weights fitted on train.
house_train_pred = houseXtrain_bias.dot(house_train_theta)
house_test_pred = houseXtest_bias.dot(house_train_theta)

In [None]:
# Mean squared error on each split.
house_train_mse = np.mean(np.square(house_train_pred - y_train))
house_test_mse = np.mean(np.square(house_test_pred - y_test))

In [None]:
# Report the training error of the normalized-feature fit.
print(f"Housing dataset training mse: {house_train_mse}")

In [None]:
# Report the held-out error of the normalized-feature fit.
print(f"Housing dataset testing mse: {house_test_mse}")

### Question 1A - Spam Dataset Work
With normalization, k-fold cross-validation, and binary predictions.

In [None]:
# Load the spam data set: a comma-separated numeric file read into a 2-D array.
data = np.loadtxt("spambase/spambase.data", delimiter=",")

In [None]:
# Show the raw array as a sanity check of the load.
data

In [None]:
# (rows, columns) of the spam array, label column included.
data.shape

In [None]:
# Features are every column but the last; the last column is the class label.
X, y = data[:, :-1], data[:, -1]

# Design matrix with a leading intercept column.
ones_col = np.ones((len(X), 1))
spamX_bias = np.concatenate((ones_col, X), axis=1)

spamX_bias.shape

In [None]:
# First five rows of the design matrix (leading column is the intercept).
spamX_bias[:5]

In [None]:
# Fit least squares on the raw spam features via the normal equations
# (X^T X) theta = X^T y. These features are not normalized, so the system is
# solved with np.linalg.solve rather than through an explicit inverse, which
# loses more precision when X^T X is poorly conditioned.
spamX_t = spamX_bias.T

# Gram matrix X^T X and moment vector X^T y:
spamX_dot = spamX_t @ spamX_bias
spamXy_dot = spamX_t @ y

# NOTE: this rebinds `best_weights`, replacing the housing weights fitted
# earlier in the notebook; the housing prediction cells must not be re-run
# after this cell without refitting.
# Raises numpy.linalg.LinAlgError if X^T X is exactly singular.
best_weights = np.linalg.solve(spamX_dot, spamXy_dot)
best_weights

In [None]:
# Continuous regression score for every row.
spamy_pred = spamX_bias.dot(best_weights)

# Scores strictly above the cut-off are labelled 1, everything else 0.
threshold = 0.431
binary_pred = np.greater(spamy_pred, threshold).astype(int)

binary_pred.shape

In [None]:
# Fraction of rows whose thresholded prediction matches the label.
# This is measured on the same rows the weights were fitted on.
correct = np.equal(binary_pred, y)
accuracy = np.mean(correct)
accuracy

In [None]:
# Range of the continuous scores before thresholding.
spam_min, spam_max = np.min(spamy_pred), np.max(spamy_pred)

In [None]:
# Lowest continuous score produced by the regression.
spam_min

In [None]:
# Highest continuous score produced by the regression.
spam_max

### K-Fold cross validation working on Spam dataset

In [None]:
# Fresh load of the spam file for the cross-validation experiment.
data = np.loadtxt("spambase/spambase.data", delimiter=",")

# Features are every column but the last; the last column is the class label.
X, y = data[:, :-1], data[:, -1]

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of thresholded linear regression on the spam data.
# Shuffling with a fixed random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Decision cut-off for turning the continuous regression output into a 0/1
# label. It is the same for every fold, so it is set once up front.
threshold = 0.43

train_fold_accuracies = []
test_fold_accuracies = []

for train_index, test_index in kf.split(X):
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Normalization statistics come from the training fold only and are
    # reused for the held-out fold.
    train_mean = np.mean(X_train_fold, axis=0)
    train_std = np.std(X_train_fold, axis=0)
    # A column with exactly zero standard deviation inside this fold would be
    # divided by zero and become NaN/inf; scale it by 1 instead.
    train_std = np.where(train_std == 0, 1.0, train_std)

    X_train_normalized = (X_train_fold - train_mean) / train_std
    X_test_normalized = (X_test_fold - train_mean) / train_std

    # Leading column of ones so the first weight is the intercept.
    ones_col_Xtrain = np.ones((X_train_normalized.shape[0], 1))
    ones_col_Xtest = np.ones((X_test_normalized.shape[0], 1))

    spam_Xtrain_bias = np.hstack([ones_col_Xtrain, X_train_normalized])
    spam_Xtest_bias = np.hstack([ones_col_Xtest, X_test_normalized])

    # Least-squares fit: solve (X^T X) theta = X^T y directly instead of
    # forming the explicit inverse (same solution, better numerical behavior).
    spamXtrain_T = spam_Xtrain_bias.T
    spamXtrain_dot = spamXtrain_T @ spam_Xtrain_bias
    spamX_y_dot = spamXtrain_T @ y_train_fold
    best_weights_X = np.linalg.solve(spamXtrain_dot, spamX_y_dot)

    # Continuous scores on both parts of the fold.
    train_continuous_predictions = spam_Xtrain_bias @ best_weights_X
    test_continuous_predictions = spam_Xtest_bias @ best_weights_X

    # Scores strictly above the threshold are labelled 1, everything else 0.
    test_binary_pred = (test_continuous_predictions > threshold).astype(int)
    train_binary_pred = (train_continuous_predictions > threshold).astype(int)

    # Per-fold accuracy: fraction of predictions equal to the label.
    test_fold_accuracy = np.mean(test_binary_pred == y_test_fold)
    train_fold_accuracy = np.mean(train_binary_pred == y_train_fold)
    test_fold_accuracies.append(test_fold_accuracy)
    train_fold_accuracies.append(train_fold_accuracy)

# Average over the folds.
test_mean_accuracy = np.mean(test_fold_accuracies)
train_mean_accuracy = np.mean(train_fold_accuracies)

print(f"Testing accuracy: {test_mean_accuracy * 100}")
print(f"Training accuracy: {train_mean_accuracy * 100}")

### Question 1B - L2 Regularization

In [None]:
# Load both data sets for the L2-regularization section: the housing
# train/test splits (whitespace-delimited) and the spam file (comma-delimited).
import numpy as np
house_data_train = np.loadtxt("housing_data/train.txt")
house_data_test = np.loadtxt("housing_data/test.txt")

spam_data = np.loadtxt("spambase/spambase.data", delimiter=",")

First, the housing dataset.

In [None]:
# Separate the housing training array into features (all columns but the
# last) and target (last column).
house_X, house_y = house_data_train[:, :-1], house_data_train[:, -1]

In [None]:
# Z-score normalization of the housing training features. The statistics are
# kept in `house_train_mean` / `house_train_std` so the test features can be
# scaled with the same values.
house_train_mean = np.mean(house_X, axis=0)
house_train_std = np.std(house_X, axis=0)

# A column whose standard deviation is exactly zero would be divided by zero
# and turn into NaN/inf. Scale such columns by 1 instead; all other columns
# are unaffected.
house_train_std = np.where(house_train_std == 0, 1.0, house_train_std)

house_X_norm = (house_X - house_train_mean) / house_train_std
house_X_norm[:3]

In [None]:
# Design matrix for ridge regression: intercept column followed by the
# normalized features.
ones_col = np.ones((len(house_X_norm), 1))
house_X_with_bias = np.concatenate((ones_col, house_X_norm), axis=1)

house_X_with_bias[:3]

L2 Regularization Equation:
$\theta = (X^T X + \lambda I)^{-1} X^T Y$

In [None]:
# Ridge regression in closed form: solve (X^T X + lambda * I) theta = X^T y.
# np.linalg.solve is used instead of multiplying by an explicit inverse: it
# yields the same solution with less work and less rounding error.
lambda_1 = 0.43  # regularization strength

house_X_transpose = house_X_with_bias.T

dot_X = house_X_transpose @ house_X_with_bias  # first term: X^T X

# The identity spans every column of the design matrix, so the intercept
# weight is penalized together with the feature weights.
I_house = np.eye(house_X_with_bias.shape[1])

dot_lambda = lambda_1 * I_house  # second term: lambda * I

add_X_lambda = dot_X + dot_lambda

XTy = house_X_transpose @ house_y

house_ridge_weights = np.linalg.solve(add_X_lambda, XTy)

house_ridge_weights

In [None]:
# Despite its name, `train_eval_mse` holds the ridge predictions on the
# training rows; the error itself is `final_mse_1`.
train_eval_mse = house_X_with_bias.dot(house_ridge_weights)

ridge_train_residuals = train_eval_mse - house_y
final_mse_1 = np.mean(np.square(ridge_train_residuals))

In [None]:
# Report the ridge model's error on the rows it was fitted on.
print(f"Training MSE: {final_mse_1}") # training mse

In [None]:
# Held-out evaluation of the ridge weights.
house_X_test, house_y_test = house_data_test[:, :-1], house_data_test[:, -1]

# Scale the test features with the training statistics.
house_X_test_norm = (house_X_test - house_train_mean) / house_train_std

# Same intercept column as the training design matrix.
ones_col_test = np.ones((len(house_X_test_norm), 1))
house_X_test_bias = np.concatenate((ones_col_test, house_X_test_norm), axis=1)

house_pred_test = house_X_test_bias.dot(house_ridge_weights)
ridge_test_residuals = house_pred_test - house_y_test
mse_house_test = np.mean(np.square(ridge_test_residuals))
print(f"Testing MSE: {mse_house_test}")

Spam dataset working - L2 Regularization: $\theta = (X^T X + \lambda I)^{-1} X^T Y$

In [None]:
from sklearn.model_selection import KFold
import numpy as np

# Ridge regression on the spam data, evaluated with 10-fold cross-validation
# for a grid of regularization strengths.

# Take the features/labels from `spam_data`, the array loaded for this
# section, instead of the globals `X` / `y`: those names are rebound by
# several earlier cells, so relying on them ties the result to execution order.
spam_X = spam_data[:, :-1]
spam_y = spam_data[:, -1]

# Define lambda values to test (7 points, log-spaced from 1e-2 to 1e1)
lambda_values = np.logspace(-2, 1, 7)
results = []  # one (lambda, mean train accuracy, mean test accuracy) per lambda

# Decision cut-off for turning the continuous output into a 0/1 label; it is
# the same for every lambda and fold, so it is set once.
threshold = 0.43

# Fixed random_state: every lambda is evaluated on the same fold assignment.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for lambda_val in lambda_values:
    print(f"Testing λ = {lambda_val}")

    train_fold_accuracies = []
    test_fold_accuracies = []

    for train_index, test_index in kf.split(spam_X):
        X_train_fold = spam_X[train_index]
        X_test_fold = spam_X[test_index]
        y_train_fold = spam_y[train_index]
        y_test_fold = spam_y[test_index]

        # Normalization statistics come from the training fold only.
        train_mean = np.mean(X_train_fold, axis=0)
        train_std = np.std(X_train_fold, axis=0)
        # A column with exactly zero standard deviation inside this fold
        # would be divided by zero and become NaN/inf; scale it by 1 instead.
        train_std = np.where(train_std == 0, 1.0, train_std)
        X_train_normalized = (X_train_fold - train_mean) / train_std
        X_test_normalized = (X_test_fold - train_mean) / train_std

        # Leading column of ones so the first weight is the intercept.
        ones_col_Xtrain = np.ones((X_train_normalized.shape[0], 1))
        ones_col_Xtest = np.ones((X_test_normalized.shape[0], 1))
        spam_Xtrain_bias = np.hstack([ones_col_Xtrain, X_train_normalized])
        spam_Xtest_bias = np.hstack([ones_col_Xtest, X_test_normalized])

        # Ridge regression: solve (X^T X + lambda * I) theta = X^T y directly
        # rather than forming the explicit inverse. The identity covers the
        # intercept column too, so the intercept is penalized as well.
        spamXtrain_T = spam_Xtrain_bias.T
        spamXtrain_dot = spamXtrain_T @ spam_Xtrain_bias
        I_lambda = np.eye(spamXtrain_dot.shape[1])
        add_X_lambda = spamXtrain_dot + lambda_val * I_lambda  # Use current lambda
        spamX_y_dot = spamXtrain_T @ y_train_fold
        best_weights_X = np.linalg.solve(add_X_lambda, spamX_y_dot)

        # predictions and accuracy
        train_continuous_pred = spam_Xtrain_bias @ best_weights_X
        test_continuous_pred = spam_Xtest_bias @ best_weights_X

        # Scores strictly above the threshold are labelled 1, else 0.
        test_binary_pred = (test_continuous_pred > threshold).astype(int)
        train_binary_pred = (train_continuous_pred > threshold).astype(int)

        test_fold_accuracy = np.mean(test_binary_pred == y_test_fold)
        train_fold_accuracy = np.mean(train_binary_pred == y_train_fold)

        test_fold_accuracies.append(test_fold_accuracy)
        train_fold_accuracies.append(train_fold_accuracy)

    # Average across folds
    avg_train = np.mean(train_fold_accuracies)
    avg_test = np.mean(test_fold_accuracies)

    results.append((lambda_val, avg_train, avg_test))
    print(f"λ={lambda_val}: Train={avg_train:.3f}, Test={avg_test:.3f}")

In [None]:
# Each entry of `results` is (lambda, mean train accuracy, mean test accuracy).
# Split it into parallel lists and report aggregates over all lambdas.
train_accuracies = [train_acc for _, train_acc, _ in results]
test_accuracies = [test_acc for _, _, test_acc in results]

# Per-lambda gap between training and test accuracy.
accuracy_gaps = [
    train_acc - test_acc
    for train_acc, test_acc in zip(train_accuracies, test_accuracies)
]

mean_train_pct = np.mean(train_accuracies) * 100
mean_test_pct = np.mean(test_accuracies) * 100
best_test_pct = np.max(test_accuracies) * 100
mean_gap_pct = np.mean(accuracy_gaps) * 100

print(f"\nSUMMARY STATISTICS:")
print(f"Average Training Accuracy: {mean_train_pct:.1f}%")
print(f"Average Test Accuracy: {mean_test_pct:.1f}%")
print(f"Best Test Accuracy: {best_test_pct:.1f}%")
print(f"Average Generalization Gap: {mean_gap_pct:.1f}%")

In [None]:
# Each entry of `results` is (lambda, mean train accuracy, mean test accuracy).
train_accuracies = [train_acc for _, train_acc, _ in results]
test_accuracies = [test_acc for _, _, test_acc in results]

# Per-lambda gap between training and test accuracy.
accuracy_gaps = [
    train_acc - test_acc
    for train_acc, test_acc in zip(train_accuracies, test_accuracies)
]

mean_train_pct = np.mean(train_accuracies) * 100
mean_test_pct = np.mean(test_accuracies) * 100
mean_gap_pct = np.mean(accuracy_gaps) * 100

print(f"\nSUMMARY STATISTICS:")
print(f"Average Training Accuracy: {mean_train_pct:.1f}%")
print(f"Average Test Accuracy: {mean_test_pct:.1f}%")
print(f"Average Generalization Gap: {mean_gap_pct:.1f}%")

# Positions of the lambdas with the highest and lowest mean test accuracy
# (first occurrence on ties, as argmax/argmin return).
best_test_idx = np.argmax(test_accuracies)
worst_test_idx = np.argmin(test_accuracies)

# Same three-line report for the best and then the worst lambda.
for heading, result_idx in (
    ("\nBEST PERFORMANCE:", best_test_idx),
    ("\nWORST PERFORMANCE:", worst_test_idx),
):
    lam, train_acc, test_acc = results[result_idx]
    print(heading)
    print(f"Lambda: {lam:.3f}")
    print(f"Train Accuracy: {train_acc*100:.1f}%")
    print(f"Test Accuracy: {test_acc*100:.1f}%")