In [24]:
import numpy as np
import os

In [25]:
def load_data(data_dir=None):
    """Load the training and validation CSV files as NumPy arrays.

    Reads ``trainData.csv``, ``trainLabels.csv``, ``valData.csv`` and
    ``valLabels.csv`` (comma-separated, numeric) from ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the four CSV files. Defaults to the
        ``dataset`` folder inside the current working directory.

    Returns
    -------
    tuple of numpy.ndarray
        ``(trainX, trainY, valX, valY)`` exactly as parsed by ``np.loadtxt``.
    """
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), 'dataset')

    def load_csv(filename):
        # Handing np.loadtxt a path (instead of an already-open handle) lets
        # it open *and close* the file itself, so no descriptor is leaked.
        return np.loadtxt(os.path.join(data_dir, filename), delimiter=",")

    trainX = load_csv("trainData.csv")
    trainY = load_csv("trainLabels.csv")
    valX = load_csv("valData.csv")
    valY = load_csv("valLabels.csv")

    return trainX, trainY, valX, valY

In [60]:
def preprocess(X, Y):
    """Reshape raw feature/label arrays into the layout used for training.

    The first column of both ``X`` and ``Y`` is discarded (presumably an
    index/ID column -- confirm against the dataset). The remaining features
    are transposed so that every sample becomes a column, and a row of ones
    is appended so each sample ends with a constant 1.

    Parameters
    ----------
    X : numpy.ndarray, 2-D, one row per sample.
    Y : numpy.ndarray, 2-D, one row per sample.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        ``X`` has one column per sample with a trailing row of ones;
        ``Y`` is the input without its first column.
        The shapes of both are printed as a side effect.
    """
    # Drop the leading column, then switch to a (features, samples) layout.
    features = X[:, 1:].T
    # A constant-1 entry at the end of every sample (column).
    ones_row = np.ones((1, features.shape[1]))
    X = np.concatenate((features, ones_row), axis=0)
    Y = Y[:, 1:]
    print(X.shape, Y.shape)
    return X, Y

In [78]:
def train(X, Y, l):
    """Fit ridge regression in closed form and compute its leave-one-out error.

    Solves ``w = (X X^T + l * D)^+ X Y`` where ``D`` is the identity with its
    last diagonal entry zeroed, so the last feature row (the row of ones
    appended by ``preprocess``) is not penalised.

    The leave-one-out (LOO) error is obtained without refitting: for sample
    ``i`` the held-out residual equals ``(w^T x_i - y_i) / (1 - h_i)`` with
    leverage ``h_i = x_i^T C^+ x_i``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_features, n_samples)
        One column per sample.
    Y : numpy.ndarray, shape (n_samples,) or (n_samples, n_targets)
        Targets, one row per sample.
    l : float
        Regularisation strength (lambda).

    Returns
    -------
    w : numpy.ndarray
        Weights, shape ``(n_features,)`` or ``(n_features, n_targets)``.
    loocv_err : numpy.ndarray or numpy scalar
        Mean squared LOO residual; shape ``(n_targets,)`` for 2-D ``Y``,
        a scalar for 1-D ``Y``.

    Raises
    ------
    ValueError
        If ``X`` contains no samples.
    """
    n_features, n_samples = X.shape
    if n_samples == 0:
        raise ValueError("train() needs at least one sample (X has zero columns)")

    # Penalty matrix: identity, except the last (bias) feature is left free.
    penalty = np.identity(n_features)
    penalty[-1, -1] = 0

    C = np.matmul(X, X.T) + l * penalty
    # pinv keeps the solution defined even when C is singular.
    Ci = np.linalg.pinv(C)
    w = np.matmul(Ci, np.matmul(X, Y))

    # In-sample residuals for every sample at once.
    residuals = np.matmul(X.T, w) - Y

    # h_i = x_i^T Ci x_i for all samples: column-wise dot product of X with
    # Ci @ X, replacing one matrix-vector product per sample in a Python loop.
    leverage = np.einsum('ij,ij->j', X, np.matmul(Ci, X))
    shrink = 1 - leverage
    if residuals.ndim > 1:
        # Broadcast the per-sample factor over the target columns.
        shrink = shrink[:, np.newaxis]

    loo_residuals = residuals / shrink
    loocv_err = np.mean(np.square(loo_residuals), axis=0)

    return w, loocv_err

In [62]:
# Read the four raw CSV arrays (train/validation features and labels);
# load_data() looks for them in the `dataset` folder of the working directory.
train_raw_X, train_raw_Y, val_raw_X, val_raw_Y = load_data()

In [63]:
# Drop the first column of every array, put X into a (features, samples)
# layout with a trailing row of ones; preprocess() prints the resulting shapes.
train_X, train_Y = preprocess(train_raw_X, train_raw_Y)
val_X, val_Y = preprocess(val_raw_X, val_raw_Y)

(3001, 5000) (5000, 1)
(3001, 5000) (5000, 1)


In [80]:
def _rmse(X, Y, w):
    """Root-mean-square error of the predictions ``X.T @ w`` against ``Y``."""
    residual = Y - np.matmul(X.T, w)
    return np.sqrt(np.average(np.square(residual)))


# Candidate regularisation strengths, one decade apart.
lambdax = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Track the lambda with the smallest leave-one-out error seen so far.
min_lambda, min_err = None, np.inf

for l in lambdax:
    w, loocv_err = train(train_X, train_Y, l)
    rmse_train = _rmse(train_X, train_Y, w)
    rmse_val = _rmse(val_X, val_Y, w)
    print("With lambda: ", l, "rmse_train: ", rmse_train, "rmse_val: ", rmse_val, "loocv_err: ", loocv_err)
    # Model selection uses the LOOCV error only; validation RMSE is just reported.
    if loocv_err < min_err:
        min_lambda, min_err = l, loocv_err

With lambda:  0.001 rmse_train:  1.1111365773926458 rmse_val:  2.76489507731023 loocv_err:  [7.60912046]
With lambda:  0.01 rmse_train:  1.1205188247625513 rmse_val:  2.5791868257079433 loocv_err:  [6.65693418]
With lambda:  0.1 rmse_train:  1.223811965696012 rmse_val:  2.1574812686460647 loocv_err:  [4.76158877]
With lambda:  1 rmse_train:  1.5780360753182296 rmse_val:  1.996771511369132 loocv_err:  [4.037986]
With lambda:  10 rmse_train:  2.1899533957516115 rmse_val:  2.3477055139162477 loocv_err:  [5.38302719]
With lambda:  100 rmse_train:  2.970941965610813 rmse_val:  3.0171052386646826 loocv_err:  [8.97944806]
With lambda:  1000 rmse_train:  3.3316125921530393 rmse_val:  3.345415090847553 loocv_err:  [11.12449278]


In [81]:
# Report the lambda that achieved the lowest leave-one-out error in the sweep.
print("Best model is with lambda: ", min_lambda, "with loocv error: ", min_err)

Best model is with lambda:  1 with loocv error:  [4.037986]
