# Question 1, Answer D


In [70]:
import numpy as np

def load_data(file_name):
    """Read a whitespace-delimited numeric table and split it into inputs and targets.

    Parameters
    ----------
    file_name : str or path-like
        File handed straight to ``np.genfromtxt``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds columns 0 and 1 of every row and ``y``
        holds column 2 of every row.
    """
    table = np.genfromtxt(file_name)
    features = table[:, :2]  # first two columns are the input coordinates
    targets = table[:, 2]    # third column is the target value
    return features, targets

def non_lin_trans(X, k):
    """Apply the nonlinear feature transform and keep features phi_0 .. phi_k.

    The full transform of a point ``(x1, x2)`` is

        (1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|)

    Parameters
    ----------
    X : array-like, shape (n, >=2)
        Input points; only columns 0 and 1 are used.
    k : int
        Index of the last feature to keep.

    Returns
    -------
    numpy.ndarray, shape (n, min(k + 1, 8))
        Note that this is ``k + 1`` columns (phi_0 through phi_k), not ``k``.
    """
    X = np.asarray(X)
    if X.size == 0:
        # keep a well-formed (0, 2) layout so an empty data set yields an
        # empty (0, k + 1) feature matrix instead of an indexing error
        X = X.reshape(0, 2)

    x1, x2 = X[:, 0], X[:, 1]
    # build all eight features column-wise in one vectorized pass;
    # ones_like keeps the constant feature in the same dtype as the inputs
    Z = np.column_stack((
        np.ones_like(x1),
        x1,
        x2,
        x1 ** 2,
        x2 ** 2,
        x1 * x2,
        np.absolute(x1 - x2),
        np.absolute(x1 + x2),
    ))
    return Z[:, :k + 1]

def run_linear_regression(Z, y):
    """One-step least-squares fit: ``w = pinv(Z) @ y``.

    Uses ``np.linalg.pinv`` (SVD based) rather than forming
    ``inv(Z.T @ Z) @ Z.T`` by hand. For full-column-rank ``Z`` the two agree
    up to floating-point error, but the explicit inverse squares the
    condition number and raises ``LinAlgError`` when ``Z.T @ Z`` is singular;
    ``pinv`` returns the minimum-norm least-squares solution in that case.

    Parameters
    ----------
    Z : numpy.ndarray, shape (n, d)
        Feature matrix, one row per sample.
    y : numpy.ndarray, shape (n,)
        Target values.

    Returns
    -------
    numpy.ndarray, shape (d,)
        Weight vector.
    """
    pinv_Z = np.linalg.pinv(Z)
    w = np.dot(pinv_Z, y)  # w = pseudo_inv(Z) * y
    return w

def compute_error(w, Z, y):
    """Return the fraction of rows of ``Z`` that ``w`` misclassifies.

    A row ``z`` is predicted as ``sign(w.T @ z)``; it counts as an error when
    that prediction differs from the matching entry of ``y``.

    Parameters
    ----------
    w : numpy.ndarray
        Weight vector.
    Z : iterable of numpy.ndarray
        Feature rows, paired positionally with ``y``.
    y : iterable
        Target labels.

    Returns
    -------
    Number of mismatches divided by the number of (row, label) pairs.
    """
    mismatches = []
    for features, label in zip(Z, y):
        prediction = np.sign(w.T @ features)
        # True marks an incorrect classification, False a correct one
        mismatches.append(prediction != label)
    return np.sum(mismatches) / len(mismatches)

In [71]:
# Model selection by validation: for each k, fit on the first 25 rows of
# in.dta using features phi_0..phi_k and score on the last 10 rows.
k_list = [3, 4, 5, 6, 7]
E_val_list = []
w_list = []

# the input data does not depend on k, so read the file once up front
X_input, y_input = load_data('in.dta')

for k in k_list:
    Z = non_lin_trans(X_input, k)

    # first 25 rows are the training set
    Z_train = Z[:25, :]
    y_train = y_input[:25]

    w_lin = run_linear_regression(Z_train, y_train)
    w_list.append(w_lin)  # kept so later cells can evaluate the same fits

    # last 10 rows are the validation set
    Z_val = Z[-10:, :]
    y_val = y_input[-10:]

    E_val = compute_error(w_lin, Z_val, y_val)
    E_val_list.append(E_val)

print(f'list of k\'s:\t\t\t {k_list}')
print(f'list of validation errors\'s: {E_val_list}')
print('As we can from this, the minimum validation error occurs at k = 6. Therefore the correct answer choice is D')

list of k's:			 [3, 4, 5, 6, 7]
list of validation errors's: [0.3, 0.5, 0.2, 0.0, 0.1]
As we can from this, the minimum validation error occurs at k = 6. Therefore the correct answer choice is D


# Question 2, Answer E

In [72]:
# Out-of-sample check: score each weight vector in w_list (one per entry of
# k_list, in the same order) on the points in out.dta.
E_out_list = []

# the test data does not depend on k, so read the file once up front
X_test, y_test = load_data('out.dta')

for i, k in enumerate(k_list):
    # the test points must go through the same phi_0..phi_k transform
    # that was used when w_list[i] was fitted
    Z_test = non_lin_trans(X_test, k)

    E_out = compute_error(w_list[i], Z_test, y_test)
    E_out_list.append(E_out)
print(f'list of k\'s:\t\t\t {k_list}')
print(f'list of out of sample errors: {E_out_list}')
print('As we can see, the lowest out of sample error occurs at k = 7. The correct answer choice is E')

list of k's:			 [3, 4, 5, 6, 7]
list of out of sample errors: [0.42, 0.416, 0.188, 0.084, 0.072]
As we can see, the lowest out of sample error occurs at k = 7. The correct answer choice is E


# Question 3, Answer D

In [73]:
# Model selection by validation with the split reversed: for each k, fit on
# the last 10 rows of in.dta and score on the first 25 rows.
# NOTE: this rebinds k_list, E_val_list and w_list, so cells that read them
# see the values from whichever experiment cell ran most recently.
k_list = [3, 4, 5, 6, 7]
E_val_list = []
w_list = []

# the input data does not depend on k, so read the file once up front
X_input, y_input = load_data('in.dta')

for k in k_list:
    Z = non_lin_trans(X_input, k)

    # last 10 rows are the training set
    Z_train = Z[-10:, :]
    y_train = y_input[-10:]

    w_lin = run_linear_regression(Z_train, y_train)
    w_list.append(w_lin)  # kept so later cells can evaluate the same fits

    # first 25 rows are the validation set
    Z_val = Z[:25, :]
    y_val = y_input[:25]

    E_val = compute_error(w_lin, Z_val, y_val)
    E_val_list.append(E_val)

print(f'list of k\'s:\t\t\t {k_list}')
print(f'list of validation errors\'s: {E_val_list}')
print('As we can from this, the minimum validation error occurs at k = 6. Therefore the correct answer choice is D')

list of k's:			 [3, 4, 5, 6, 7]
list of validation errors's: [0.28, 0.36, 0.2, 0.08, 0.12]
As we can from this, the minimum validation error occurs at k = 6. Therefore the correct answer choice is D


# Question 4, Answer D

In [74]:
# Out-of-sample check: score each weight vector in w_list (one per entry of
# k_list, in the same order) on the points in out.dta.
E_out_list = []

# the test data does not depend on k, so read the file once up front
X_test, y_test = load_data('out.dta')

for i, k in enumerate(k_list):
    # the test points must go through the same phi_0..phi_k transform
    # that was used when w_list[i] was fitted
    Z_test = non_lin_trans(X_test, k)

    E_out = compute_error(w_list[i], Z_test, y_test)
    E_out_list.append(E_out)
print(f'list of k\'s:\t\t\t {k_list}')
print(f'list of out of sample errors: {E_out_list}')
print('As we can see, the lowest out of sample error occurs at k = 6. The correct answer choice is D')
print('The past 4 problems illustrate the notion that a higher number of samples in the validation set leads to a better estimate of out of sample error. However, overall performance goes down because the training set has now been reduced')

list of k's:			 [3, 4, 5, 6, 7]
list of out of sample errors: [0.396, 0.388, 0.284, 0.192, 0.196]
As we can see, the lowest out of sample error occurs at k = 6. The correct answer choice is D
The past 4 problems illustrate the notion that a higher number of samples in the validation set leads to a better estimate of out of sample error. However, overall performance goes down because the training set has now been reduced


# Question 5, Answer

In [76]:
# Out-of-sample errors of the models picked by validation (k = 6 in both
# experiments), copied from the out-of-sample error lists printed above.
E_out_1 = 0.084 # out of sample error of model selected from first val method

E_out_2 = 0.192 # out of sample error of model selected from second val method

# The previous version subtracted E_val_1 / E_val_2, which are not defined
# anywhere in this notebook, so the cell raised NameError on a fresh kernel.
# The recorded output "0.1, 0.2" is exactly these two errors rounded to one
# decimal place, so report them directly.
print("{:.1f}, {:.1f}".format(E_out_1, E_out_2))


0.1, 0.2
