# Lab 5

In [2]:
"""
Name: Yi Qian Goh
Date: 10/24/2024
Class: CSEN140 Machine Learning and Data Mining
Brief Description: Implements Linear Regression and Ridge Regression in Python without any machine learning libraries. 
"""
import numpy as np

def txt_to_array(filename):
    """Load a whitespace-delimited data file into a design matrix and targets.

    The first line of the file is treated as a header and skipped. On every
    remaining non-blank line the first value is the target and the rest are
    the features.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(X_dummy, y)``: ``X_dummy`` is the n x (p+1) feature matrix
        with a trailing column of ones (so the last weight acts as the bias
        term) and ``y`` is the n x 1 column of targets.

    Raises:
        ValueError: If there are no data rows after the header, or a value
            cannot be parsed as a float.
    """
    with open(filename, 'r') as file:
        next(file, None)  # skip the header line; tolerate an empty file
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # ignored; otherwise they become empty rows and the array is ragged.
        rows = [list(map(float, line.split())) for line in file if line.strip()]

    if not rows:
        raise ValueError("no data rows found in {!r}".format(filename))

    data = np.array(rows)
    y = data[:, 0].reshape(-1, 1)
    X = data[:, 1:]

    # Append a column of ones to the end of X; this accommodates the bias
    # term in w.
    dummy_col = np.ones(len(X))
    X_dummy = np.column_stack((X, dummy_col))

    # y is n x 1, X_dummy is n x (p+1)   (rows x cols)
    return X_dummy, y

def RMSE(actual, predicted):
    """Return the root mean square error between targets and predictions.

    Computed as sqrt(sum((predicted - actual)^2) / n) with n = len(actual).
    """
    squared_residuals = np.square(predicted - actual)
    mean_squared_error = np.sum(squared_residuals) / len(actual)
    return np.sqrt(mean_squared_error)

def linRegTrain(X, y):
    """Fit ordinary least-squares weights with the normal equations.

    Setting the gradient of the squared-error loss to zero gives
    (X^T X) w = X^T y. The system is solved directly rather than by forming
    (X^T X)^(-1) explicitly, which is cheaper and numerically more accurate.

    Args:
        X: Feature matrix with one row per sample and one column per feature.
        y: Targets with one row per sample.

    Returns:
        The weights w, one row per column of X.

    Raises:
        numpy.linalg.LinAlgError: If X^T X is singular.
    """
    gram = np.dot(X.T, X)    # X^T X
    moment = np.dot(X.T, y)  # X^T y
    return np.linalg.solve(gram, moment)

def problem1(X, w):
    """Predict targets for X using linear-regression weights w.

    Returns the product of X and w; for an n x (p+1) X and a (p+1) x 1 w
    this is an n x 1 matrix of predicted y values.
    """
    predictions = np.dot(X, w)
    return predictions

def ridRegTrain(X, y, lamda):
    """Fit ridge-regression weights in closed form.

    Setting the gradient of the L2-regularized squared-error loss to zero
    gives (X^T X + lambda * I) w = X^T y. The system is solved directly
    rather than by forming the inverse explicitly, which is cheaper and
    numerically more accurate.

    Args:
        X: Feature matrix with one row per sample and one column per feature.
        y: Targets with one row per sample.
        lamda: Regularization strength; it is added to every diagonal entry
            of X^T X, so every weight is penalized.

    Returns:
        The weights w, one row per column of X.

    Raises:
        numpy.linalg.LinAlgError: If X^T X + lambda * I is singular.
    """
    regularized_gram = np.dot(X.T, X) + lamda * np.identity(X.shape[1])
    moment = np.dot(X.T, y)  # X^T y
    return np.linalg.solve(regularized_gram, moment)

def problem2(X, w):
    """Predict targets for X using ridge-regression weights w.

    Returns the product of X and w, i.e. the predicted values.
    """
    predicted_values = np.dot(X, w)
    return predicted_values


# Data files and hyper-parameter used by the experiments below.
TRAIN_FILE = 'crime-train.txt'
TEST_FILE = 'crime-test.txt'
RIDGE_LAMBDA = 100  # regularization strength passed to ridRegTrain

print("------------------Linear Regression---------------------")
X_train, y_train = txt_to_array(TRAIN_FILE)
w_train = linRegTrain(X_train, y_train)
train_pred = problem1(X_train, w_train)
train_rmse = RMSE(y_train, train_pred)
print("RMSE of Training Set: ", train_rmse)

# The test set is scored with the weights learned from the training set.
X_test, y_test = txt_to_array(TEST_FILE)
test_pred = problem1(X_test, w_train)
test_rmse = RMSE(y_test, test_pred)
print("RMSE of Test Set    : ", test_rmse)

print("\n------------------Ridge Regression---------------------")
# Ridge uses the same data, so the arrays loaded above are reused instead of
# parsing the files a second time. The *_2 names refer to the same arrays,
# which this script never modifies.
X_train_2, y_train_2 = X_train, y_train
w_train_2 = ridRegTrain(X_train_2, y_train_2, RIDGE_LAMBDA)
train_pred_2 = problem2(X_train_2, w_train_2)
train_rmse_2 = RMSE(y_train_2, train_pred_2)
print("RMSE of Training Set: ", train_rmse_2)

X_test_2, y_test_2 = X_test, y_test
test_pred_2 = problem2(X_test_2, w_train_2)
test_rmse_2 = RMSE(y_test_2, test_pred_2)
print("RMSE of Test Set    : ", test_rmse_2)


------------------Linear Regression---------------------
RMSE of Training Set:  0.12768967421762195
RMSE of Test Set    :  0.14583464490949627

------------------Ridge Regression---------------------
RMSE of Training Set:  0.13134320424615792
RMSE of Test Set    :  0.14765698468526103
