## Import and Preprocess Data

In [None]:
import numpy as np
from datetime import datetime

# Print the current wall-clock time (HH:MM:SS) at the start of the cell
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

# Method for reading in the "pickled" object images
def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and deserialized with
    ``encoding='bytes'``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    import pickle

    with open(file, mode='rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Preprocessing- convert to greyscale, normalize values between 0-1
def rgb2gray(im):
    """Convert rows of planar RGB pixels to grayscale scaled by 1/255.

    Parameters
    ----------
    im : 2-D array-like
        One image per row. Columns 0-1023 are read as the red channel,
        columns 1024-2047 as green and columns 2048-3071 as blue; any
        further columns are ignored.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(rows, 1024)`` holding
        ``(0.2989 * r + 0.5870 * g + 0.1140 * b) / 255`` for every pixel.

    Raises
    ------
    ValueError
        If ``im`` is not 2-D or has fewer than 3072 columns.
    """
    pixels = 1024  # values per colour plane

    im = np.asarray(im)
    if im.ndim != 2 or im.shape[1] < 3 * pixels:
        raise ValueError(
            "im must be a 2-D array with at least %d columns" % (3 * pixels)
        )

    # Cast each plane to float first so integer pixel types cannot overflow,
    # then apply the weights to whole planes at once instead of looping over
    # every pixel in Python.
    red = im[:, 0:pixels].astype(np.float64)
    green = im[:, pixels:2 * pixels].astype(np.float64)
    blue = im[:, 2 * pixels:3 * pixels].astype(np.float64)

    return (0.2989 * red + 0.5870 * green + 0.1140 * blue) / 255

# Every batch is a dictionary with the following items:
#   b'batch_label' --> specifies which batch it is
#   b'labels'      --> array of 10,000 labels 0-9 corresponding to the correct classification
#   b'data'        --> 10,000 x 3072 array of uint8 pixels; each row is a 32x32 image stored as
#                      1024 red entries, then 1024 green entries, then 1024 blue entries
#
# NOTE(review): data_batch_1..data_batch_5 and test_batch must already exist when this runs
# (presumably loaded with unpickle) -- their loading is not shown in this part of the file.

# Pull the labels and the raw pixel rows out of each batch dictionary.
# db6_* and tb_* are both taken from test_batch.
db1_labels, db1_data = data_batch_1[b'labels'], data_batch_1[b'data']
db2_labels, db2_data = data_batch_2[b'labels'], data_batch_2[b'data']
db3_labels, db3_data = data_batch_3[b'labels'], data_batch_3[b'data']
db4_labels, db4_data = data_batch_4[b'labels'], data_batch_4[b'data']
db5_labels, db5_data = data_batch_5[b'labels'], data_batch_5[b'data']
db6_labels, db6_data = test_batch[b'labels'], test_batch[b'data']
tb_labels, tb_data = test_batch[b'labels'], test_batch[b'data']

# Convert the six db*_data arrays to grayscale; tb_data keeps its raw pixel values
db1_data, db2_data, db3_data = rgb2gray(db1_data), rgb2gray(db2_data), rgb2gray(db3_data)
db4_data, db5_data, db6_data = rgb2gray(db4_data), rgb2gray(db5_data), rgb2gray(db6_data)

# Print the wall-clock time (HH:MM:SS) once preprocessing has finished
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

## Ridge Regression Method

In [None]:
def _count_label_errors(predictions, labels):
    """Count rows whose rounded, absolute first-column prediction differs from its label."""
    rounded = np.abs(np.rint(predictions[:, 0]))
    return int(np.count_nonzero(rounded != labels))


def ridge_regression(A, d, T1, T2, y1, y2, lambdas):
    """Fit ridge regression per lambda, select on (T1, y1), evaluate on (T2, y2).

    For each candidate ``lam`` the weights ``w`` solve
    ``(A.T @ A + lam * I) w = A.T @ d``. The candidate with the fewest
    misclassifications on ``(T1, y1)`` is chosen (ties go to the earliest
    candidate), the chosen value is printed, and its weights are scored on
    ``(T2, y2)``. A prediction counts as correct when its absolute value,
    rounded to the nearest integer, equals the label.

    Parameters
    ----------
    A : 2-D array
        Training matrix, one sample per row.
    d : 2-D array
        Training targets as a column (one row per row of ``A``); only the
        first column of the resulting predictions is scored.
    T1 : 2-D array
        Samples used to select lambda.
    T2 : 2-D array
        Samples used for the reported errors.
    y1, y2 : sequence of numbers
        Labels for the rows of ``T1`` and ``T2`` respectively.
    lambdas : sequence of numbers
        Candidate regularization strengths; must not be empty.

    Returns
    -------
    list
        ``[error_rate, squared_error]`` where ``error_rate`` is the fraction
        of misclassified rows of ``T2`` and ``squared_error`` is the squared
        Euclidean norm of (prediction - label) over those rows.

    Raises
    ------
    ValueError
        If ``lambdas`` is empty.
    """
    if len(lambdas) == 0:
        raise ValueError("lambdas must contain at least one value")

    A = np.asarray(A)
    y1 = np.asarray(y1).ravel()
    y2 = np.asarray(y2).ravel()

    # These products do not depend on lambda, so compute them once instead of
    # once per candidate.
    gram = A.T @ A
    rhs = A.T @ d
    identity = np.identity(A.shape[1])

    ws = []
    selection_errors = np.zeros(len(lambdas))

    # Fit one weight vector per lambda and count its errors on the selection set
    for idx, lam in enumerate(lambdas):
        # Solve the regularized normal equations directly rather than forming
        # an explicit inverse.
        w = np.linalg.solve(gram + lam * identity, rhs)
        ws.append(w)
        selection_errors[idx] = _count_label_errors(T1 @ w, y1)

    # Pick the lambda with the fewest selection errors (first one on ties)
    min_idx = int(np.argmin(selection_errors))
    lam = lambdas[min_idx]
    print("Lambda Chosen: " + str(lam))

    # Reuse the weights already fitted for the chosen lambda
    w = ws[min_idx]

    # Score the chosen weights on the second set
    y_hat = T2 @ w
    error_count = _count_label_errors(y_hat, y2)
    error_rate = error_count / len(y2)

    # Compare the prediction column with the labels element by element;
    # subtracting the 1-D labels from the (n, 1) predictions would broadcast
    # to an (n, n) matrix and inflate the norm.
    squared_error = np.linalg.norm(y_hat[:, 0] - y2) ** 2

    return [error_rate, squared_error]

## Testing

In [None]:
# Print the wall-clock time (HH:MM:SS) before the runs start
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# Candidate regularization strengths, shared by every iteration.
# NOTE(review): the original passed natural-log exponents (np.log) to np.logspace, whose default
# base is 10, which produced a grid from roughly 511 to 1233. Using log10 makes the grid span
# 15..22 as the literals suggest -- confirm this is the intended range.
lambdas = np.logspace(np.log10(15), np.log10(22))

# The six preprocessed batches and their labels, in iteration order
batch_data = [db1_data, db2_data, db3_data, db4_data, db5_data, db6_data]
batch_labels = [db1_labels, db2_labels, db3_labels, db4_labels, db5_labels, db6_labels]
num_batches = len(batch_data)

error_rates = []
squared_errors = []

# Each iteration holds one batch out: the other five form the training set, the first 5000 rows
# of the held-out batch select lambda and its remaining rows give the reported errors.
for held_out in range(num_batches):
    train_ids = [k for k in range(num_batches) if k != held_out]

    A = np.vstack([batch_data[k] for k in train_ids])  # Training matrix
    # Known classifiers. Concatenate end to end so row i of d is the label of row i of A;
    # column_stack(...).reshape(50000, 1) interleaved the batches and misaligned the labels.
    d = np.concatenate([np.array(batch_labels[k]) for k in train_ids]).reshape(-1, 1)

    T1 = batch_data[held_out][0:5000, :]
    T2 = batch_data[held_out][5000:, :]
    y1 = batch_labels[held_out][0:5000]
    y2 = batch_labels[held_out][5000:]

    print("Ridge Regression Iteration " + str(held_out + 1))
    [error_rate, squared_error] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)
    error_rates.append(error_rate)
    squared_errors.append(squared_error)
    print("Error Rate: " + str(round(error_rate*100,3)) + ", Sqaured Error: " + str(round(squared_error,3)))
    print()

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)
    print()
    # Iterations are separated by one extra blank line; none follows the last one
    if held_out < num_batches - 1:
        print()

# Keep the per-iteration result names available for any later code that reads them
error_rate1, error_rate2, error_rate3, error_rate4, error_rate5, error_rate6 = error_rates
squared_error1, squared_error2, squared_error3, squared_error4, squared_error5, squared_error6 = squared_errors