In [1]:
import numpy as np
from datetime import datetime

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# Method for reading in the "pickled" object images
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def rgb2gray(im):
    col_size = len(im[:,0])
    im_out = np.empty([col_size,1024])
    
    for i in range(0,col_size):
        for j in range(0,1024):
            r = im[i,j] 
            g = im[i,j+1024]
            b = im[i,j+2048]
            im_out[i,j] = (0.2989 * r + 0.5870 * g + 0.1140 * b)
    
    return im_out

# Read in the datasets 5 training batches and 1 test batch, each has 10,000 images
data_batch_1 = unpickle('data_batch_1')
data_batch_2 = unpickle('data_batch_2')
data_batch_3 = unpickle('data_batch_3')
data_batch_4 = unpickle('data_batch_4')
data_batch_5 = unpickle('data_batch_5')
test_batch = unpickle('test_batch')

# Each data_batch is a dictionary with the following items
# b'batch_label --> specifies which batch it is
# b'labels --> array of 10,000 labels 0-9 correspoding to the correct classification
# b'data --> 10,000 x 3072 array of uint8 pixels, each rows is a 32x32 image with the first 1024 entries being the red,
#            the second 1024 entries being the green, and the last 1024 entries being the blue

db1_labels = data_batch_1[b'labels']
db1_data = data_batch_1[b'data']
db2_labels = data_batch_2[b'labels']
db2_data = data_batch_2[b'data']
db3_labels = data_batch_3[b'labels']
db3_data = data_batch_3[b'data']
db4_labels = data_batch_4[b'labels']
db4_data = data_batch_4[b'data']
db5_labels = data_batch_5[b'labels']
db5_data = data_batch_5[b'data']
db6_labels = test_batch[b'labels']
db6_data = test_batch[b'data']
tb_labels = test_batch[b'labels']
tb_data = test_batch[b'data']


db1_data = rgb2gray(db1_data)
db2_data = rgb2gray(db2_data)
db3_data = rgb2gray(db3_data)
db4_data = rgb2gray(db4_data)
db5_data = rgb2gray(db5_data)
db6_data = rgb2gray(db6_data)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 23:23:13
Current Time = 23:35:30


In [6]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# def ridge_regression(A1, A2, A3, A4, A5, d1, d2, d3, d4, d5, T1, T2, y1, y2, lambdas):
def ridge_regression(A1, d1, T1, T2, y1, y2, lambdas):
    print("Running Ridge Regression")
#     A = np.vstack((A1, A2, A3, A4, A5)) #Training matrix
#     d = np.vstack((d1, d2, d3, d4, d5)) #Known classifiers
    A = A1 #Training matrix
    d = d1 #Known classifiers

#     print(len(A[:,0])) # row size
#     print(len(A[0,:])) # col size

    num_iterations = len(lambdas)
    training_errors = np.zeros(num_iterations)
    
#     ws = [len(A[0,:],num_iterations]
#     ws = np.empty([len(A[0,:]),num_iterations])
#     ws = np.empty([num_iterations,len(A[0,:])])
    ws = []
    # Perform the training over all the different lambdas
    for lam in range(0, num_iterations):
        print("Iteration " + str(lam) + ", using lambda = " + str(lambdas[lam]))
        
#         w1 = A.T @ A 
#         print (len(w1))
#         w2 = np.linalg.inv(w1 + lambdas[lam] * np.identity(len(w1))) 
#         w = w2 @ A.T @ d
        w1 = np.linalg.inv(A.T @ A + lambdas[lam] * np.identity(len(A[0,:]))) 
        w2 = A.T @ d
        w = w1 @ w2
        print(len(w[:,0])) # row size
        print(len(w[0,:])) # col size
#         w = np.linalg.inv(A.T @ A + lambdas[lam] * np.identity(len(A[0,:]))) @ A.T @ d
#         ws[lam,:] = w
        ws.append(w)

        # Find the predictions for the first test set
        t_hat = T1 @ w
        error_count = 0

        # Record the number of errors
        for i in range(0, len(t_hat)):
            if abs(round(t_hat[i,0])) != y1[i]:
#                 if i < 5:
#                     print("y = " +str(y1[i]) + ", y_hat = "+str(abs(round(t_hat[i]))))
                error_count += 1
        
        training_errors[lam] = error_count
    
    # Determine which lambda gave the lowest error rates
    min_idx = 0
    min_error = 50000

    for i in range(0,num_iterations):
        if training_errors[i] < min_error:
            min_idx = i
            min_error = training_errors[i]
    

    # Use the selected lambda with the rest of the training data to get w
    lam = lambdas[min_idx]
    print("Lambda Chosen: " + str(lam))

#     w = np.linalg.inv(A.T @ A + lam * np.identity(len(A[0,:]))) @ A.T @ d
    w = ws[min_idx]

    # Find the predictions for the second test set
    y_hat = T2 @ w
    error_count = 0

    # Record the number of errors
    for i in range(0, len(y_hat)):
        if abs(round(y_hat[i,0])) != y2[i]:
#             if i < 5:
#                 print("y = " +str(y2[i]) + ", y_hat = "+str(abs(round(y_hat[i]))))
            error_count += 1
    
    print("HERE0")
    # Calculate the errors and return them
    error_rate = error_count / len(y2)
    print("HERE1")
    squared_error = np.linalg.norm(y_hat - y2)**2
    print("HERE2")

    return ([error_rate, squared_error])



# Logarithmic spaced lambdas
# TODO: how to determine the span of these?
lambdas = np.logspace(-6,np.log(40),num=10)
# lambdas = [0,0.25,0.5,0.75,1,2,4,8]

# [error_rate1, squared_error1] = ridge_regression(db2_data, db3_data, db4_data, db5_data, db6_data, 
#                                                     db2_labels, db3_labels, db4_labels, db5_labels, db6_labels, 
#                                                     db1_data[0:5000,:], db1_data[5000:10000,:], db1_labels[0:5000], 
#                                                     db1_labels[5000:10000], lambdas)

A = np.vstack((db1_data, db5_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db5_labels))).reshape(20000,1) #Known classifiers
# d = np.vstack((np.array(db1_labels).T, np.array(db5_labels).T)).T #Known classifiers
print(len(d[0,:])) #col
print(len(d[:,0])) # row size
T1 = db3_data[0:5000,:]
T2 = db3_data[5000:,:]
y1 = db3_labels[0:5000]
y2 = db3_labels[5000:]

print(len(T1[0,:]))
print(len(T1[:,0]))


[error_rate1, squared_error1] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)
print("Ridge Regression Iteration 1")
print("Error Rate: " + str(round(error_rate1*100,3)) + ", Sqaured Error: " + str(round(squared_error1,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 15:47:06
1
20000
1024
5000
Running Ridge Regression
Iteration 0, using lambda = 1e-06
1024
1
Iteration 1, using lambda = 1.192730051182656e-05
1024
1
Iteration 2, using lambda = 0.00014226049749941843
1024
1
Iteration 3, using lambda = 0.0016967837046375164
1024
1
Iteration 4, using lambda = 0.02023804914878204
1024
1
Iteration 5, using lambda = 0.2413852939706391
1024
1
Iteration 6, using lambda = 2.8790749403234144
1024
1
Iteration 7, using lambda = 34.33959200930656
1024
1
Iteration 8, using lambda = 409.57863334851737
1024
1
Iteration 9, using lambda = 4885.167443171
1024
1
Lambda Chosen: 409.57863334851737
HERE0
HERE1
HERE2
Ridge Regression Iteration 1
Error Rate: 89.24, Sqaured Error: 232355668.488

Current Time = 15:47:52


In [20]:
# def ridge_regression(A1, A2, A3, A4, A5, d1, d2, d3, d4, d5, T1, T2, y1, y2, lambdas):
def ridge_regression(A1, d1, T1, T2, y1, y2, lambdas):
#     print("Running Ridge Regression")
#     A = np.vstack((A1, A2, A3, A4, A5)) #Training matrix
#     d = np.vstack((d1, d2, d3, d4, d5)) #Known classifiers
    A = A1 #Training matrix
    d = d1 #Known classifiers

#     print(len(A[:,0])) # row size
#     print(len(A[0,:])) # col size

    num_iterations = len(lambdas)
    training_errors = np.zeros(num_iterations)
    
#     ws = [len(A[0,:],num_iterations]
#     ws = np.empty([len(A[0,:]),num_iterations])
#     ws = np.empty([num_iterations,len(A[0,:])])
    ws = []
    # Perform the training over all the different lambdas
    for lam in range(0, num_iterations):
#         print("Iteration " + str(lam) + ", using lambda = " + str(lambdas[lam]))
        
#         w1 = A.T @ A 
#         print (len(w1))
#         w2 = np.linalg.inv(w1 + lambdas[lam] * np.identity(len(w1))) 
#         w = w2 @ A.T @ d
        w1 = np.linalg.inv(A.T @ A + lambdas[lam] * np.identity(len(A[0,:]))) 
        w2 = A.T @ d
        w = w1 @ w2
#         print(len(w[:,0])) # row size
#         print(len(w[0,:])) # col size
#         w = np.linalg.inv(A.T @ A + lambdas[lam] * np.identity(len(A[0,:]))) @ A.T @ d
#         ws[lam,:] = w
        ws.append(w)

        # Find the predictions for the first test set
        t_hat = T1 @ w
        error_count = 0

        # Record the number of errors
        for i in range(0, len(t_hat)):
            if abs(round(t_hat[i,0])) != y1[i]:
#                 if i < 5:
#                     print("y = " +str(y1[i]) + ", y_hat = "+str(abs(round(t_hat[i]))))
                error_count += 1
        
        training_errors[lam] = error_count
    
    # Determine which lambda gave the lowest error rates
    min_idx = 0
    min_error = 50000

    for i in range(0,num_iterations):
        if training_errors[i] < min_error:
            min_idx = i
            min_error = training_errors[i]
    

    # Use the selected lambda with the rest of the training data to get w
    lam = lambdas[min_idx]
    print("Lambda Chosen: " + str(lam))

#     w = np.linalg.inv(A.T @ A + lam * np.identity(len(A[0,:]))) @ A.T @ d
    w = ws[min_idx]

    # Find the predictions for the second test set
    y_hat = T2 @ w
    error_count = 0

    # Record the number of errors
    for i in range(0, len(y_hat)):
        if abs(round(y_hat[i,0])) != y2[i]:
#             if i < 5:
#                 print("y = " +str(y2[i]) + ", y_hat = "+str(abs(round(y_hat[i]))))
            error_count += 1
    
#     print("HERE0")
    # Calculate the errors and return them
    error_rate = error_count / len(y2)
#     print("HERE1")
    squared_error = np.linalg.norm(y_hat - y2)**2
#     print("HERE2")

    return ([error_rate, squared_error])

In [19]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)


lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db2_data, db3_data, db4_data, db5_data, db6_data)) #Training matrix
d = np.column_stack((np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
T1 = db1_data[0:5000,:]
T2 = db1_data[5000:,:]
y1 = db1_labels[0:5000]
y2 = db1_labels[5000:]

print("Ridge Regression Iteration 1")
[error_rate1, squared_error1] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate1*100,3)) + ", Sqaured Error: " + str(round(squared_error1,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db1_data, db3_data, db4_data, db5_data, db6_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
T1 = db2_data[0:5000,:]
T2 = db2_data[5000:,:]
y1 = db2_labels[0:5000]
y2 = db2_labels[5000:]

print("Ridge Regression Iteration 2")
[error_rate2, squared_error2] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate2*100,3)) + ", Sqaured Error: " + str(round(squared_error2,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db1_data, db2_data, db4_data, db5_data, db6_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
T1 = db3_data[0:5000,:]
T2 = db3_data[5000:,:]
y1 = db3_labels[0:5000]
y2 = db3_labels[5000:]

print("Ridge Regression Iteration 3")
[error_rate3, squared_error3] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate3*100,3)) + ", Sqaured Error: " + str(round(squared_error3,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db1_data, db2_data, db3_data, db5_data, db6_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
T1 = db4_data[0:5000,:]
T2 = db4_data[5000:,:]
y1 = db4_labels[0:5000]
y2 = db4_labels[5000:]

print("Ridge Regression Iteration 4")
[error_rate4, squared_error4] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate4*100,3)) + ", Sqaured Error: " + str(round(squared_error4,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db1_data, db2_data, db3_data, db4_data, db6_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
T1 = db5_data[0:5000,:]
T2 = db5_data[5000:,:]
y1 = db5_labels[0:5000]
y2 = db5_labels[5000:]

print("Ridge Regression Iteration 5")
[error_rate5, squared_error5] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate5*100,3)) + ", Sqaured Error: " + str(round(squared_error5,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

lambdas = np.logspace(np.log(15),np.log(22))
A = np.vstack((db1_data, db2_data, db3_data, db4_data, db5_data)) #Training matrix
d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels))).reshape(50000,1) #Known classifiers
T1 = db6_data[0:5000,:]
T2 = db6_data[5000:,:]
y1 = db6_labels[0:5000]
y2 = db6_labels[5000:]

print("Ridge Regression Iteration 6")
[error_rate6, squared_error6] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

print("Error Rate: " + str(round(error_rate6*100,3)) + ", Sqaured Error: " + str(round(squared_error6,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()

Current Time = 20:35:21
Ridge Regression Iteration 1
Lambda Chosen: 830.0165428267144
Error Rate: 89.4, Sqaured Error: 231596041.516

Current Time = 20:37:55


Ridge Regression Iteration 2
Lambda Chosen: 845.0899104376215
Error Rate: 89.84, Sqaured Error: 229674995.151

Current Time = 20:39:45


Ridge Regression Iteration 3
Lambda Chosen: 975.9605520020432
Error Rate: 89.8, Sqaured Error: 235685548.314

Current Time = 20:42:23


Ridge Regression Iteration 4
Lambda Chosen: 510.56401383812477
Error Rate: 88.86, Sqaured Error: 229002457.506

Current Time = 20:44:11


Ridge Regression Iteration 5
Lambda Chosen: 510.56401383812477
Error Rate: 89.0, Sqaured Error: 233462062.894

Current Time = 20:46:04


Ridge Regression Iteration 6
Lambda Chosen: 656.8656610960368
Error Rate: 89.92, Sqaured Error: 231679979.435

Current Time = 20:48:47



In [17]:
lambdas = np.logspace(np.log(15),np.log(22))
print(np.round(lambdas,2))
print(np.log(40))

[ 510.56  519.84  529.28  538.89  548.67  558.64  568.78  579.11  589.63
  600.34  611.24  622.34  633.64  645.15  656.87  668.79  680.94  693.31
  705.9   718.72  731.77  745.06  758.59  772.36  786.39  800.67  815.21
  830.02  845.09  860.44  876.06  891.97  908.17  924.66  941.46  958.55
  975.96  993.68 1011.73 1030.1  1048.81 1067.86 1087.25 1106.99 1127.1
 1147.57 1168.41 1189.63 1211.23 1233.23]
3.6888794541139363


In [13]:
###################################################################################################
#
# K-Nearest Neighbors
#
# def k_nearest_neighbors(A1, A2, A3, A4, A5, d1, d2, d3, d4, d5, T, y, k):
def k_nearest_neighbors(A, d, T, y, k):
#     A = np.vstack((A1, A2, A3, A4, A5)) #Training matrix
#     d = np.vstack((d1, d2, d3, d4, d5)) #Known classifiers
    train_size = len(A[:,0])
    test_size = len(T[:,0])

#     distances = np.empty((test_size, train_size,))
    distances = []

#     train_size = len(A[:,0])
#     test_size = len(T[:,0])

    test_errors = np.zeros(test_size)
    labels = np.zeros(test_size)
    error_count = 0
    
    for i in range(0, test_size):
        for j in range(0, train_size):
            # curr_A = A[i,:]
            # curr_T = T1[j,:]
            # distance = distance_fn(curr_A, curr_T)
            # distances[j].append((distance, i))
#             distances[i,j] = (np.linalg.norm(A[j,:]-T[i,:]),j)
            distances.append((np.linalg.norm(A[j,:]-T[i,:]),j))

#     distances = np.sqrt((T**2).sum(axis=1)[:, np.newaxis] + (A**2).sum(axis=1) - 2 * T.dot(A.T))
    # distances = np.sqrt((T**2).sum(axis=1)[:, np.newaxis] + (self.A**2).sum(axis=1) - 2 * T.dot(self.A.T))


#         sort_distances = []
#         for i in range(0, test_size):
        sort_distances = sorted(distances)

#         k_nearest = []
#         for i in range(0, test_size):
        k_nearest = sort_distances[:k]

        k_labels = []
        for dist, idx in k_nearest:
            k_labels.append(d[idx])

        labels[i] = statistics.mode(k_labels)

        if labels[i] != y[i]:
            error_count += 1
        
        
        
    # Calculate the errors and return them
    error_rate = error_count / len(y)
    squared_error = np.linalg.norm(labels - y)**2

    return ([error_rate, squared_error])


In [None]:
import statistics 

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)


# A = np.vstack((db2_data, db3_data, db4_data, db5_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1)
# A = np.vstack((db2_data, db3_data)) #Training matrix
# d = np.column_stack((np.array(db2_labels), np.array(db3_labels))).reshape(20000,1) #Known classifiers
A = db2_data
d = db2_labels
T = db1_data[0:5000,:]
y = db1_labels[0:5000]
k = 1

print("KNN Iteration 1")
[error_rate1, squared_error1] = k_nearest_neighbors(A, d, T, y, k)

print("Error Rate: " + str(round(error_rate1*100,3)) + ", Sqaured Error: " + str(round(squared_error1,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

# lambdas = np.logspace(np.log(15),np.log(22))
# A = np.vstack((db1_data, db3_data, db4_data, db5_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
# T1 = db2_data[0:5000,:]
# T2 = db2_data[5000:,:]
# y1 = db2_labels[0:5000]
# y2 = db2_labels[5000:]

# print("KNN Iteration 2")
# [error_rate2, squared_error2] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

# print("Error Rate: " + str(round(error_rate2*100,3)) + ", Sqaured Error: " + str(round(squared_error2,3)))
# print()

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Current Time =", current_time)
# print()
# print()

# lambdas = np.logspace(np.log(15),np.log(22))
# A = np.vstack((db1_data, db2_data, db4_data, db5_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
# T1 = db3_data[0:5000,:]
# T2 = db3_data[5000:,:]
# y1 = db3_labels[0:5000]
# y2 = db3_labels[5000:]

# print("KNN Iteration 3")
# [error_rate3, squared_error3] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

# print("Error Rate: " + str(round(error_rate3*100,3)) + ", Sqaured Error: " + str(round(squared_error3,3)))
# print()

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Current Time =", current_time)
# print()
# print()

# lambdas = np.logspace(np.log(15),np.log(22))
# A = np.vstack((db1_data, db2_data, db3_data, db5_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
# T1 = db4_data[0:5000,:]
# T2 = db4_data[5000:,:]
# y1 = db4_labels[0:5000]
# y2 = db4_labels[5000:]

# print("KNN Iteration 4")
# [error_rate4, squared_error4] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

# print("Error Rate: " + str(round(error_rate4*100,3)) + ", Sqaured Error: " + str(round(squared_error4,3)))
# print()

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Current Time =", current_time)
# print()
# print()

# lambdas = np.logspace(np.log(15),np.log(22))
# A = np.vstack((db1_data, db2_data, db3_data, db4_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
# T1 = db5_data[0:5000,:]
# T2 = db5_data[5000:,:]
# y1 = db5_labels[0:5000]
# y2 = db5_labels[5000:]

# print("KNN Iteration 5")
# [error_rate5, squared_error5] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

# print("Error Rate: " + str(round(error_rate5*100,3)) + ", Sqaured Error: " + str(round(squared_error5,3)))
# print()

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Current Time =", current_time)
# print()
# print()

# lambdas = np.logspace(np.log(15),np.log(22))
# A = np.vstack((db1_data, db2_data, db3_data, db4_data, db5_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db2_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels))).reshape(50000,1) #Known classifiers
# T1 = db6_data[0:5000,:]
# T2 = db6_data[5000:,:]
# y1 = db6_labels[0:5000]
# y2 = db6_labels[5000:]

# print("KNN Iteration 6")
# [error_rate6, squared_error6] = ridge_regression(A, d, T1, T2, y1, y2, lambdas)

# print("Error Rate: " + str(round(error_rate6*100,3)) + ", Sqaured Error: " + str(round(squared_error6,3)))
# print()

# now = datetime.now()
# current_time = now.strftime("%H:%M:%S")
# print("Current Time =", current_time)
# print()

Current Time = 22:07:48
KNN Iteration 1


In [6]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# A = db2_data
# d = db2_labels
# T = db1_data[0:5000,:]
# y = db1_labels[0:5000]

A = np.vstack((db5_data, db3_data)) #Training matrix
d = np.column_stack((np.array(db5_labels), np.array(db3_labels))).reshape(20000,1) #Known classifiers
T = db1_data
y = db1_labels

# A = np.vstack((db1_data, db3_data, db4_data, db5_data, db6_data)) #Training matrix
# d = np.column_stack((np.array(db1_labels), np.array(db3_labels), np.array(db4_labels), np.array(db5_labels), np.array(db6_labels))).reshape(50000,1) #Known classifiers
# T = db2_data
# y = db2_labels

train_size = len(A[:,0])
test_size = len(T[:,0])

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10,algorithm='ball_tree',p=2)
# knn = KNeighborsClassifier(n_neighbors=3,algorithm='kd_tree',p=2)
knn.fit(A, d)
# KNeighborsClassifier(...)


error_count = 0
labels = np.zeros(test_size)

for i in range(0,test_size):
    test = T[i,:].reshape((1,-1))
    y_hat = knn.predict(test)
    labels[i] = y_hat
#     y_hat = knn.predict(T[i,:])
    
    if y_hat != y[i]:
        error_count += 1
    
    
error_rate = error_count / test_size
squared_error = np.linalg.norm(labels - y)**2

# for i in range(0,20):
#     print(labels[i])


print("Error Rate: " + str(round(error_rate*100,3)) + ", Sqaured Error: " + str(round(squared_error,3)))
print()

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()

Current Time = 01:13:13




Error Rate: 90.23, Sqaured Error: 175417.0

Current Time = 01:20:33


