# 3. Cross Validation And L2 Regularization

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

* define functions

In [5]:
# Data-reading helper
def read_input(path):
    """
    Load a delimited text file into a pandas DataFrame.

    path: location of the file; it is handed straight to pd.read_table, so
          pandas' defaults (tab separator, first line as header) apply.
    return: the parsed DataFrame.
    """
    table = pd.read_table(path)
    return table


In [6]:
# Compute the cost function (halved mean squared error)
def computeCost(X1,Y,w,lbd):
    """
    Halved mean squared error of the linear model X1 @ w against Y.

    X1:  m x n feature matrix (including the column of all 1's)
    Y:   m x 1 target vector
    w:   n x 1 weight vector
    lbd: regularization parameter; accepted for signature compatibility but
         not used, so the returned cost carries no penalty term.
    return: ||X1 w - Y||^2 / (2 m)
    """
    residual = np.dot(X1, w) - Y
    sample_count = len(Y)
    squared_norm = np.linalg.norm(residual) ** 2
    return squared_norm / (2 * sample_count)

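* Weight-fitting function, using the regularized normal equation $\vec{w} = \left(X^TX + \lambda \begin{bmatrix} 0 & 0 \\ 0 & I_{n-1} \end{bmatrix}\right)^{-1}X^TY$. The zero in the top-left corner of the block matrix leaves the first weight (the intercept) unpenalized.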

In [7]:
def normaleq(X1,Y,lbd):
    """
    Closed-form ridge-regression fit via the regularized normal equation.

    X1:  m x n feature matrix whose FIRST column is the all-ones intercept
         column (that column's weight is the one left unpenalized)
    Y:   m x 1 target vector
    lbd: regularization parameter lambda
    return: weight vector w solving (X1^T X1 + lbd * P) w = X1^T Y, where P is
            the n x n identity with P[0, 0] set to 0.
    raises: numpy.linalg.LinAlgError if the regularized Gram matrix is singular.
    """
    n = X1.shape[1]
    # Penalty matrix: identity with a zero for the intercept (column 0).
    penalty = np.eye(n)
    penalty[0, 0] = 0
    gram = X1.T @ X1 + lbd * penalty
    # Solve the linear system directly instead of forming an explicit inverse:
    # same result, but cheaper and numerically more stable when lbd is tiny
    # and the Gram matrix is ill-conditioned.
    w = np.linalg.solve(gram, X1.T @ Y)
    return w

In [8]:
# Split the data into k folds and return the row indices of each fold
def splitdata(data,k):
    """
    Partition the row positions 0..m-1 of `data` into k contiguous folds.

    data: DataFrame (or any object with a .shape whose first entry is the
          number of rows)
    k:    number of folds
    return: array of length k; element i holds the row indices of fold i.
            The first k-1 folds each have m//k rows and the last fold takes
            all remaining rows. When m is divisible by k the result is a 2-D
            integer array of shape (k, m//k); otherwise it is a 1-D object
            array whose elements are the (unequal-length) index arrays.
    """
    m = data.shape[0]
    inter = m // k
    folds = [np.arange(j * inter, (j + 1) * inter) for j in range(k - 1)]
    # The last fold absorbs the m % k rows left over by the integer division.
    folds.append(np.arange(len(folds) * inter, m))

    if all(len(fold) == len(folds[0]) for fold in folds):
        return np.asarray(folds)

    # Folds of unequal length cannot be stacked into a rectangular array
    # (np.asarray raises ValueError on recent NumPy), so wrap them in an
    # explicit object array, filled element by element.
    ragged = np.empty(len(folds), dtype=object)
    for j, fold in enumerate(folds):
        ragged[j] = fold
    return ragged

In [9]:
if __name__ == '__main__':    
    # Data files, given as paths relative to the current working directory.
    trainpath = "crime-train.txt"
    testpath = 'crime-test.txt'
    
    # Load both files into DataFrames (read_input wraps pd.read_table).
    df_train = read_input(trainpath)
    df_test = read_input(testpath)

* k-fold split the training data

In [11]:
    # Number of cross-validation folds.
    k = 10
    
    # Row indices of each fold of the training data, as returned by splitdata.
    indies = splitdata(df_train,k)

In [None]:
    # lambda is sampled with np.logspace(power_s, power_e, num), i.e. num values
    # between 10**power_s and 10**power_e, evenly spaced in the exponent.
    # weighthist records, for each held-out fold, the list of weight vectors
    # fitted for every lambda.
    weighthist = []
    power_s = -8
    power_e = 2
    num = 100

In [14]:
    # Start training the k hypotheses.
    # m, n: number of rows and columns of the full training DataFrame.
    m,n = df_train.shape

* Problem 3(1)

In [15]:
    # Row positions of the FULL training frame. This is kept in its own name so
    # that nothing inside the loop can shrink it: the per-fold training size
    # used to be written into `m`, which corrupted the train/held-out split
    # from the second fold onwards.
    all_rows = np.arange(df_train.shape[0])
    # Grid of regularization strengths and the matching log10 values (plot axis).
    lambdas = np.logspace(power_s,power_e,num)
    x = np.linspace(power_s,power_e,num)

    for i in range(k):
        weight = []
        # Fold i is held out for evaluation; all remaining rows are used to fit.
        held_out = indies[i]
        train_data_t = np.asarray(df_train.iloc[held_out,:])
        train_data = np.asarray(df_train.iloc[np.setdiff1d(all_rows,held_out),:])

        # Column 0 is the target, the remaining columns are the features.
        Y = np.asarray(train_data[:,0:1],dtype=np.float32)
        X = np.asarray(train_data[:,1:],dtype=np.float32)

        m_train = X.shape[0]
        n = X.shape[1] + 1
        # Put the all-ones intercept column FIRST: normaleq leaves column 0
        # unpenalized, so the intercept has to sit there. As a consequence
        # w[0] is the intercept and w[1:] are the feature weights.
        X1 = np.concatenate((np.ones((m_train,1),float),X),axis = 1)

        test_Y = np.asarray(train_data_t[:,0:1],dtype=np.float32)
        test_X = np.asarray(train_data_t[:,1:],dtype=np.float32)
        m1,n1 = test_X.shape
        n1 = n1+1
        # Same column layout as X1 (intercept first).
        test_X1 = np.concatenate((np.ones((m1,1),float),test_X),axis = 1)

        # NOTE(review): loop_max and epsilon are not read by the closed-form
        # solver used below; they are kept only in case later cells use them.
        # max loop number
        loop_max = 2e5
        # threshold to terminate program.
        epsilon = 1e-4

        # Squared error on the held-out fold, one entry per lambda.
        error = []
        for lbd in lambdas:
            w = normaleq(X1,Y,lbd)
            weight.append(w)
            error.append(computeCost(test_X1,test_Y,w,lbd))

        weighthist.append(weight)
        # Figures for problem (1): one curve per fold, all on figure 1.
        plt.figure(1)
        # Raw string so that the backslash in \lambda is not parsed as an escape.
        plt.xlabel(r'log($\lambda$)')
        plt.ylabel('square  error in the training data')
        plt.plot(x,error,label='train set '+ str(i))
        plt.legend(loc='upper left')