In [None]:
# Imports
# -- standard library --
import os
import pickle #To read data for this experiment
import random
import string
import sys
import time
import _pickle as cPickle

# -- third-party --
import numpy as np #Represent ndarrays a.k.a. tensors
import matplotlib.pyplot as plt #For plotting
import theano
import theano.tensor as T
import lasagne
from sklearn.preprocessing import normalize
try:
    # Current scikit-learn releases ship the splitters in model_selection.
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    # Very old scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import StratifiedKFold

np.random.seed(0) #For repeatability of the experiment

def unpickle(file):
    """Read one pickled object from ``file`` and return it.

    Parameters
    ----------
    file : str
        Path of the pickle file to read. The file is opened in binary mode.

    Returns
    -------
    object
        The unpickled object. ``load_data`` indexes the result with the keys
        ``"data"`` and ``"labels"``, so for the CIFAR-10 batch files it is
        expected to be a dict.

    Notes
    -----
    ``encoding="latin-1"`` is passed to the loader so that pickles written by
    Python 2 can be decoded under Python 3.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the file.
    # Only use this on dataset files obtained from a trusted source.
    # The context manager closes the handle even when loading raises.
    with open(file, 'rb') as fo:
        return cPickle.load(fo, encoding="latin-1")

def load_data(data_dir='cifar-10-batches-py'):
    """Load the CIFAR-10 training and test batches as flat arrays.

    Reads ``data_batch_1`` ... ``data_batch_5`` and ``test_batch`` from
    ``data_dir`` with ``unpickle`` and takes the ``"data"`` and ``"labels"``
    entries of each batch.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the batch files. Defaults to the relative path
        ``'cifar-10-batches-py'`` that was previously hard-coded, so existing
        ``load_data()`` calls behave as before.

    Returns
    -------
    X_train : ndarray, shape (N_train, -1)
        Training samples of all five batches stacked row-wise, one flattened
        sample per row.
    y_train : ndarray of int32
        Training labels, concatenated in the same batch order.
    X_test : ndarray, shape (N_test, -1)
        Test samples, one flattened sample per row.
    y_test : ndarray of int32
        Test labels.
    """
    train_samples = []
    train_labels = []
    for batch_no in range(1, 6):
        batch = unpickle(os.path.join(data_dir, 'data_batch_' + str(batch_no)))
        train_samples.append(batch["data"])
        train_labels.append(batch["labels"])

    X_all = np.concatenate(train_samples)
    y_train = np.concatenate(train_labels).astype(np.int32)

    test_batch = unpickle(os.path.join(data_dir, 'test_batch'))
    X_test_raw = test_batch["data"]
    y_test = np.array(test_batch["labels"], dtype=np.int32)

    # Flatten everything after the first axis so each sample is one row.
    # (Presumably a no-op for the standard batches, which already store one
    # row per image -- kept so other layouts are handled too.)
    X_train = np.reshape(X_all, (X_all.shape[0], -1))
    X_test = np.reshape(X_test_raw, (X_test_raw.shape[0], -1))

    # A bias column could be appended here with np.hstack([X, ones]); the
    # classifiers in this notebook carry explicit bias vectors instead.
    return X_train, y_train, X_test, y_test

# Read the CIFAR-10 training and test arrays from disk (see load_data).
X_train1, y_train1, X_test1, y_test1 = load_data()

# Rescale every sample (axis=1, i.e. each row) to unit L1 norm. The raw arrays
# stay available as X_train1 / X_test1.
Xtr = normalize(X_train1, axis = 1, norm = "l1")
Xtst = normalize(X_test1, axis = 1, norm = "l1")
# Short aliases for the label vectors; the experiment cells use Xtr/ytr and Xtst/ytst.
ytst = y_test1
ytr = y_train1

# -

In [None]:
# Linear model    
def train_linear_classifier(X_train, y_train, step_size, reg, gd_iters, print_every=None):
    """Fit a softmax (linear) classifier with full-batch gradient descent.

    Parameters
    ----------
    X_train : ndarray, shape (N, D)
        Training samples, one per row.
    y_train : integer array, shape (N,)
        Class indices starting at 0; the number of classes is taken to be
        ``max(y_train) + 1``.
    step_size : float
        Learning rate applied to every parameter update.
    reg : float
        L2 regularisation strength (penalty ``0.5 * reg * sum(W * W)``).
    gd_iters : int
        Number of gradient-descent iterations. Every iteration uses the whole
        training set as one batch.
    print_every : int or None, optional
        When set to a positive integer, the loss is computed and printed every
        ``print_every`` iterations. The default prints no loss, which matches
        the previous behaviour.

    Returns
    -------
    W : ndarray, shape (D, K)
        Learned weights.
    b : ndarray, shape (1, K)
        Learned biases.

    Notes
    -----
    Weights are drawn from the global NumPy random state, so seed it for
    repeatable runs. The three hyper-parameters are printed before training.
    """
    D = X_train.shape[1]  # number of features
    K = int(np.max(y_train)) + 1  # number of classes, labels start at 0

    # Small random weights, zero biases.
    W = 0.01 * np.random.randn(D, K)
    b = np.zeros((1, K))

    num_examples = X_train.shape[0]
    rows = np.arange(num_examples)

    print("reg param:",reg)
    print("step size:",step_size)
    print("iterations:",gd_iters)

    for i in range(gd_iters):
        # Class scores, [N x K].
        scores = np.dot(X_train, W) + b

        # Softmax is invariant to a per-row shift. Subtracting the row maximum
        # keeps np.exp from overflowing to inf (which used to turn the
        # probabilities, and then every parameter, into NaN).
        scores -= np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # The loss is only needed for reporting, so it is computed on demand.
        if print_every and i % print_every == 0:
            data_loss = -np.mean(np.log(probs[rows, y_train]))
            reg_loss = 0.5 * reg * np.sum(W * W)
            print("iteration:", i, " loss:", data_loss + reg_loss)

        # Gradient of the mean cross-entropy w.r.t. the scores: (p - onehot) / N.
        # probs is reused in place; it is not needed afterwards.
        dscores = probs
        dscores[rows, y_train] -= 1
        dscores /= num_examples

        # Backpropagate to the parameters and add the L2 term for W.
        dW = np.dot(X_train.T, dscores) + reg * W
        db = np.sum(dscores, axis=0, keepdims=True)

        W -= step_size * dW
        b -= step_size * db
    return W, b

def test_linear_classifier(X_data, y_data, W, b):
    scores = np.dot(X_data, W) + b
    predicted_class = np.argmax(scores, axis=1)
    test_accuracy = (np.mean(predicted_class == y_data))
    return test_accuracy

In [None]:
# Baseline run of the softmax classifier on the L1-normalised arrays
# (Xtr/ytr, Xtst/ytst) prepared earlier in the notebook.
# The 80:20 re-split below is disabled.
#Xtr, Xtst, ytr, ytst = test_train_data_split(X,y,0.8)

# Positional hyper-parameters: step_size=0.8, reg=0.4, gd_iters=100.
W_tr, b_tr = train_linear_classifier(Xtr, ytr, 0.8, 0.4, 100)

# Accuracy on the data the model was fitted on, then on the test arrays.
print("Accuracy")
train_accuracy = test_linear_classifier(Xtr, ytr, W_tr, b_tr)
print("train accuracy: ",train_accuracy,"\n")
test_accuracy = test_linear_classifier(Xtst, ytst, W_tr, b_tr)
print("test accuracy: ",test_accuracy,"\n")
#plotting model result (disabled; plot_result is not defined in the visible cells)
#plot_result(Xtst, ytst, W_tr, b_tr)

# --- Sweep 1: learning rate (reg=0.4, 100 iterations) ---
# Candidates are 0, 10, ..., 90. With step_size 0 the update
# `W += -step_size * dW` never moves the randomly initialised parameters.
# NOTE(review): every sweep in this cell scores its candidates on Xtst/ytst,
# i.e. on the same data that is reported as test accuracy.
step_size_values = list(np.arange(0,100,10))  #Also called learning rate
# step_size_values = np.append(step_size_values, list(np.arange(1,10,1)))
# step_size_values = np.append(step_size_values, list(np.arange(10,100,5)))

# One accuracy slot per candidate, filled in below.
test_results = [0] * len(step_size_values)
print("Changing step size")
for i, step_size in enumerate(step_size_values):
    # W_tr / b_tr are overwritten by every run; only the accuracy is kept.
    W_tr, b_tr = train_linear_classifier(Xtr, ytr, step_size, 0.4, 100)
    test_result = test_linear_classifier(Xtst, ytst, W_tr, b_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. learning rate; the y axis is fixed to [0, 1].
plt.plot(step_size_values,test_results,'-')
plt.axis([0, max(step_size_values)+1, 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('learning rate')
plt.show()

# --- Sweep 2: regularisation strength (step_size=1, 100 iterations) ---
reg_values = list([0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1])

test_results = [0] * len(reg_values)
print("Changing Reg parameter")
for i, reg_value in enumerate(reg_values):
    W_tr, b_tr = train_linear_classifier(Xtr, ytr, 1, reg_value, 100)
    test_result = test_linear_classifier(Xtst, ytst, W_tr, b_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. regularisation strength.
plt.plot(reg_values,test_results,'-')
plt.axis([0, max(reg_values), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('regularization value')
plt.show()

# --- Sweep 3: number of gradient-descent iterations ---
# reg = 0.4, step_size = 1
gd_iters_vals = list([100, 500, 1000])

test_results = [0] * len(gd_iters_vals)
print("Changing no of iterations")
for i, gd_iters in enumerate(gd_iters_vals):
    # Each run starts from a fresh random initialisation; nothing is warm-started.
    W_tr, b_tr = train_linear_classifier(Xtr, ytr, 1, 0.4, gd_iters)
    test_result = test_linear_classifier(Xtst, ytst, W_tr, b_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. iteration count.
plt.plot(gd_iters_vals,test_results,'-')
plt.axis([0, max(gd_iters_vals), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('no of iterations')
plt.show()

In [None]:
### CIFAR 10 on FFN RELU

In [None]:
# FFN model    
def train_ffn_classifier(X_train, y_train, step_size, reg, gd_iters, hidden_size=100, print_every=None):
    """Fit a one-hidden-layer ReLU network with full-batch gradient descent.

    Parameters
    ----------
    X_train : ndarray, shape (N, D)
        Training samples, one per row.
    y_train : integer array, shape (N,)
        Class indices starting at 0; the number of classes is taken to be
        ``max(y_train) + 1``.
    step_size : float
        Learning rate applied to every parameter update.
    reg : float
        L2 regularisation strength, applied to both weight matrices.
    gd_iters : int
        Number of gradient-descent iterations over the whole training set.
    hidden_size : int, optional
        Number of units in the single hidden layer. Defaults to 100, the
        value that was previously hard-coded.
    print_every : int or None, optional
        When set to a positive integer, the loss is computed and printed every
        ``print_every`` iterations. The default prints no loss.

    Returns
    -------
    W : ndarray, shape (D, hidden_size)
        Input-to-hidden weights.
    b : ndarray, shape (1, hidden_size)
        Hidden biases.
    W2 : ndarray, shape (hidden_size, K)
        Hidden-to-output weights.
    b2 : ndarray, shape (1, K)
        Output biases.

    Notes
    -----
    Weights are drawn from the global NumPy random state. The three
    hyper-parameters ``reg``, ``step_size`` and ``gd_iters`` are printed
    before training.
    """
    h = hidden_size  # number of units in the hidden layer
    K = int(np.max(y_train)) + 1  # number of output classes, labels start at 0
    D = X_train.shape[1]  # number of input features

    # W2 is drawn before W, as before, so seeded runs reproduce the same initialisation.
    W2 = 0.01*np.random.randn(h,K)
    b2 = np.zeros((1,K))
    W = 0.01*np.random.randn(D,h)
    b = np.zeros((1,h))

    # The batch is the whole training set.
    num_examples = X_train.shape[0]
    rows = np.arange(num_examples)

    print("reg param:",reg)
    print("step size:",step_size)
    print("iterations:",gd_iters)

    for i in range(gd_iters):
        # Forward pass: ReLU hidden layer, then linear class scores.
        hidden_layer = np.maximum(0, np.dot(X_train, W) + b)
        scores = np.dot(hidden_layer, W2) + b2

        # Softmax is invariant to a per-row shift. Subtracting the row maximum
        # keeps np.exp from overflowing to inf, which used to produce NaN
        # probabilities and then NaN parameters.
        scores -= np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(scores)
        probs = exp_scores/np.sum(exp_scores, axis = 1, keepdims = True)

        # The loss is only needed for reporting, so it is computed on demand.
        if print_every and i % print_every == 0:
            data_loss = -np.mean(np.log(probs[rows, y_train]))
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            print("iteration:", i, " loss:", data_loss + reg_loss)

        # Gradient of the mean cross-entropy w.r.t. the scores: (p - onehot) / N.
        # probs is reused in place; it is not needed afterwards.
        dscores = probs
        dscores[rows, y_train] -= 1
        dscores /= num_examples

        # Output layer.
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis = 0, keepdims = True)

        # Hidden layer: ReLU passes no gradient where its output is 0.
        dhidden = np.dot(dscores, W2.T)
        dhidden[hidden_layer <= 0] = 0

        # Input layer.
        dW = np.dot(X_train.T, dhidden)
        db = np.sum(dhidden, axis = 0, keepdims = True)

        # L2 regularisation gradient (weights only, not biases).
        dW2 += reg*W2
        dW += reg*W

        # Parameter update.
        W -= step_size * dW
        W2 -= step_size * dW2
        b2 -= step_size * db2
        b -= step_size * db
    return W, b, W2, b2

def test_ffn_classifier(X_data, y_data, W, b, W2, b2):
    # Post-training: evaluate model accuracy
    hidden_layer = np.maximum(0, np.dot(X_data, W) + b) 
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis = 1) ### scores with max conf
    test_accuracy = (np.mean(predicted_class == y_data))
    return test_accuracy


In [None]:
# Baseline run of the ReLU network on the arrays (Xtr/ytr, Xtst/ytst)
# prepared earlier in the notebook. The 80:20 re-split below is disabled.
# Xtr, Xtst, ytr, ytst = test_train_data_split(X,y,0.8)

# Positional hyper-parameters: step_size=0.4, reg=0.001, gd_iters=100.
W_tr, b_tr, W2_tr, b2_tr = train_ffn_classifier(Xtr, ytr, 0.4, 0.001, 100)

# Accuracy on the data the model was fitted on, then on the test arrays.
train_accuracy = test_ffn_classifier(Xtr, ytr, W_tr, b_tr, W2_tr, b2_tr)
print("train accuracy: ",train_accuracy,)
test_accuracy = test_ffn_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
print("test accuracy: ",test_accuracy,"\n")

#plotting model result (disabled; plot_result is not defined in the visible cells)
# plot_result(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)

# --- Sweep 1: learning rate (reg=0.001, 100 iterations) ---
# Candidates are 0, 10, ..., 90. With step_size 0 the parameter updates are
# all multiplied by zero, so the network stays at its random initialisation.
# NOTE(review): every sweep in this cell scores its candidates on Xtst/ytst,
# i.e. on the same data that is reported as test accuracy.
step_size_values = list(np.arange(0,100,10))  #Also called learning rate
# step_size_values = np.append(step_size_values, list(np.arange(1,10,1)))
# step_size_values = np.append(step_size_values, list(np.arange(10,100,5)))

# One accuracy slot per candidate, filled in below.
test_results = [0] * len(step_size_values)
print("Changing step size")
for i, step_size in enumerate(step_size_values):
    # The *_tr parameters are overwritten by every run; only the accuracy is kept.
    W_tr, b_tr, W2_tr, b2_tr = train_ffn_classifier(Xtr, ytr, step_size, 0.001, 100)
    test_result = test_ffn_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. learning rate; the y axis is fixed to [0, 1].
plt.plot(step_size_values,test_results,'-')
plt.axis([0, max(step_size_values)+1, 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('learning rate')
plt.show()

# --- Sweep 2: regularisation strength ---
# The learning rate is held at 0.4 (100 iterations) and only reg varies.
reg_values = list([0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1])

test_results = [0] * len(reg_values)
print("Changing reg parameter")
for i, reg_value in enumerate(reg_values):
    W_tr, b_tr, W2_tr, b2_tr = train_ffn_classifier(Xtr, ytr, 0.4, reg_value, 100)
    test_result = test_ffn_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. regularisation strength.
plt.plot(reg_values,test_results,'-')
plt.axis([0, max(reg_values), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('regularization value')
plt.show()

# --- Sweep 3: number of gradient-descent iterations ---
# The calls below use step_size = 0.4 and reg = 0.001.
gd_iters_vals = list([100, 500, 1000])

test_results = [0] * len(gd_iters_vals)
print("Changing no of iterations")
for i, gd_iters in enumerate(gd_iters_vals):
    # Each run starts from a fresh random initialisation; nothing is warm-started.
    W_tr, b_tr, W2_tr, b2_tr = train_ffn_classifier(Xtr, ytr, 0.4, 0.001, gd_iters)
    test_result = test_ffn_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. iteration count.
plt.plot(gd_iters_vals,test_results,'-')
plt.axis([0, max(gd_iters_vals), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('no of iterations')
plt.show()

In [None]:
##CIFAR 10 FFN Leaky RELU

In [None]:
def train_ffn_leakyrelu_classifier(X_train, y_train, step_size, reg, gd_iters, hidden_size=100, print_every=None):
    """Fit a one-hidden-layer leaky-ReLU network with full-batch gradient descent.

    The hidden activation is ``max(z, 0.01 * z)`` with ``z = X . W + b``.

    Parameters
    ----------
    X_train : ndarray, shape (N, D)
        Training samples, one per row.
    y_train : integer array, shape (N,)
        Class indices starting at 0; the number of classes is taken to be
        ``max(y_train) + 1``.
    step_size : float
        Learning rate applied to every parameter update.
    reg : float
        L2 regularisation strength, applied to both weight matrices.
    gd_iters : int
        Number of gradient-descent iterations over the whole training set.
    hidden_size : int, optional
        Number of units in the single hidden layer. Defaults to 100, the
        value that was previously hard-coded.
    print_every : int or None, optional
        When set to a positive integer, the loss is computed and printed every
        ``print_every`` iterations. The default prints no loss.

    Returns
    -------
    W : ndarray, shape (D, hidden_size)
        Input-to-hidden weights.
    b : ndarray, shape (1, hidden_size)
        Hidden biases.
    W2 : ndarray, shape (hidden_size, K)
        Hidden-to-output weights.
    b2 : ndarray, shape (1, K)
        Output biases.

    Notes
    -----
    Weights are drawn from the global NumPy random state. The three
    hyper-parameters ``reg``, ``step_size`` and ``gd_iters`` are printed
    before training.
    """
    h = hidden_size  # number of units in the hidden layer
    K = int(np.max(y_train)) + 1  # number of output classes, labels start at 0
    D = X_train.shape[1]  # number of input features

    # W2 is drawn before W, as before, so seeded runs reproduce the same initialisation.
    W2 = 0.01*np.random.randn(h,K)
    b2 = np.zeros((1,K))
    W = 0.01*np.random.randn(D,h)
    b = np.zeros((1,h))

    # The batch is the whole training set.
    num_examples = X_train.shape[0]
    rows = np.arange(num_examples)

    print("reg param:",reg)
    print("step size:",step_size)
    print("iterations:",gd_iters)

    for i in range(gd_iters):
        # Forward pass. The pre-activation is computed once and reused for
        # both arms of the leaky ReLU.
        pre_activation = np.dot(X_train, W) + b
        hidden_layer = np.maximum(pre_activation, 0.01*pre_activation)
        scores = np.dot(hidden_layer, W2) + b2

        # Softmax is invariant to a per-row shift. Subtracting the row maximum
        # keeps np.exp from overflowing to inf, which used to produce NaN
        # probabilities and then NaN parameters.
        scores -= np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(scores)
        probs = exp_scores/np.sum(exp_scores, axis = 1, keepdims = True)

        # The loss is only needed for reporting, so it is computed on demand.
        if print_every and i % print_every == 0:
            data_loss = -np.mean(np.log(probs[rows, y_train]))
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            print("iteration:", i, " loss:", data_loss + reg_loss)

        # Gradient of the mean cross-entropy w.r.t. the scores: (p - onehot) / N.
        # probs is reused in place; it is not needed afterwards.
        dscores = probs
        dscores[rows, y_train] -= 1
        dscores /= num_examples

        # Output layer.
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis = 0, keepdims = True)

        # Hidden layer: the leaky arm (output <= 0) scales the gradient by 0.01.
        dhidden = np.dot(dscores, W2.T)
        dhidden[hidden_layer <= 0] *= 0.01

        # Input layer.
        dW = np.dot(X_train.T, dhidden)
        db = np.sum(dhidden, axis = 0, keepdims = True)

        # L2 regularisation gradient (weights only, not biases).
        dW2 += reg*W2
        dW += reg*W

        # Parameter update.
        W -= step_size * dW
        W2 -= step_size * dW2
        b2 -= step_size * db2
        b -= step_size * db
    return W, b, W2, b2

def test_ffn_leakyrelu_classifier(X_data, y_data, W, b, W2, b2):
    # Post-training: evaluate model accuracy
    hidden_layer = np.maximum(0.01*np.dot(X_data, W) + b, np.dot(X_data, W) + b) 
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis = 1) ### scores with max conf
    test_accuracy = (np.mean(predicted_class == y_data))
    return test_accuracy


In [None]:
# Baseline run of the leaky-ReLU network on the arrays (Xtr/ytr, Xtst/ytst)
# prepared earlier in the notebook. The 80:20 re-split below is disabled.
# Xtr, Xtst, ytr, ytst = test_train_data_split(X,y,0.8)

# Trained once on the full training arrays (no cross validation is involved).
# Positional hyper-parameters: step_size=0.4, reg=0.001, gd_iters=100.
W_tr, b_tr, W2_tr, b2_tr = train_ffn_leakyrelu_classifier(Xtr, ytr, 0.4, 0.001, 100)

# Accuracy on the data the model was fitted on, then on the test arrays.
train_accuracy = test_ffn_leakyrelu_classifier(Xtr, ytr, W_tr, b_tr, W2_tr, b2_tr)
print("train accuracy: ",train_accuracy)
test_accuracy = test_ffn_leakyrelu_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
print("test accuracy: ",test_accuracy,"\n")

#plotting model result (disabled; plot_result is not defined in the visible cells)
# plot_result(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)

# --- Sweep 1: learning rate (reg=0.001, 100 iterations) ---
# Candidates are 0, 10, ..., 90. With step_size 0 the parameter updates are
# all multiplied by zero, so the network stays at its random initialisation.
# NOTE(review): every sweep in this cell scores its candidates on Xtst/ytst,
# i.e. on the same data that is reported as test accuracy.
step_size_values = list(np.arange(0,100,10))  #Also called learning rate
# step_size_values = np.append(step_size_values, list(np.arange(1,10,1)))
# step_size_values = np.append(step_size_values, list(np.arange(10,100,5)))

test_results = [0] * len(step_size_values)
print("Changing step size")
for i, step_size in enumerate(step_size_values):
    W_tr, b_tr, W2_tr, b2_tr = train_ffn_leakyrelu_classifier(Xtr, ytr, step_size, 0.001, 100)
    # Score with the leaky-ReLU forward pass. The plain ReLU evaluator
    # (test_ffn_classifier) would zero the negative activations the model was
    # trained with and so measure a different network.
    test_result = test_ffn_leakyrelu_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. learning rate; the y axis is fixed to [0, 1].
plt.plot(step_size_values,test_results,'-')
plt.axis([0, max(step_size_values)+1, 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('learning rate')
plt.show()

# --- Sweep 2: regularisation strength ---
# The learning rate is held at 0.4 (100 iterations) and only reg varies.
reg_values = list([0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1])

test_results = [0] * len(reg_values)
print("Changing reg parameter")
for i, reg_value in enumerate(reg_values):
    W_tr, b_tr, W2_tr, b2_tr = train_ffn_leakyrelu_classifier(Xtr, ytr, 0.4, reg_value, 100)
    # Same evaluator as the model's activation (see the note in sweep 1).
    test_result = test_ffn_leakyrelu_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. regularisation strength.
plt.plot(reg_values,test_results,'-')
plt.axis([0, max(reg_values), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('regularization value')
plt.show()

# --- Sweep 3 (disabled): number of gradient-descent iterations ---
# As written below it would use step_size = 0.4 and reg = 0.8.
# gd_iters_vals = list([100, 500, 1000])

# test_results = [0] * len(gd_iters_vals)
# print("Changing no of iterations")
# for i, gd_iters in enumerate(gd_iters_vals):
#     W_tr, b_tr, W2_tr, b2_tr = train_ffn_leakyrelu_classifier(Xtr, ytr, 0.4, 0.8, gd_iters)
#     test_result = test_ffn_leakyrelu_classifier(Xtst, ytst, W_tr, b_tr, W2_tr, b2_tr)
#     print("test accuracy: ",test_result,"\n")
#     test_results[i] = test_result

# plt.plot(gd_iters_vals,test_results,'-')
# plt.axis([0, max(gd_iters_vals), 0, 1])
# plt.ylabel('test accuracy')
# plt.xlabel('no of iterations')
# plt.show()

In [None]:
##CIFAR 10 FFN Maxout

In [None]:
# cross validation scheme
def cross_validate_maxout_classifier(Xtr, ytr, num_of_folds, step_size, reg, gd_iters):
    """Train and score the maxout network with stratified k-fold cross validation.

    For every fold a fresh model is trained on the training part with
    ``train_ffn_maxout_classifier`` and scored on the validation part with
    ``test_ffn_maxout_classifier``. The running mean of the fold accuracies is
    printed after each fold.

    Parameters
    ----------
    Xtr : ndarray, shape (N, D)
        Samples, one per row.
    ytr : integer array, shape (N,)
        Class labels used both for training and for stratifying the folds.
    num_of_folds : int
        Number of folds (scikit-learn requires at least 2).
    step_size, reg, gd_iters
        Passed unchanged to ``train_ffn_maxout_classifier``.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2, W3, b3)`` of the model trained on the LAST fold
        only; the fold accuracies themselves are printed, not returned.
    """
    # sklearn.cross_validation (and its StratifiedKFold(y, n_folds=...) form)
    # no longer exists in current scikit-learn; the splitter now lives in
    # model_selection and produces the folds through .split(X, y).
    from sklearn.model_selection import StratifiedKFold

    # No shuffling is requested, so the folds are deterministic. random_state
    # is left out because current scikit-learn rejects it without shuffle=True.
    skf = StratifiedKFold(n_splits=num_of_folds)
    # Validation accuracy of every fold processed so far.
    CV_pred_accuracies = []

    for train_index, valid_index in skf.split(Xtr, ytr):
        X_train, X_valid = Xtr[train_index], Xtr[valid_index]
        y_train, y_valid = ytr[train_index], ytr[valid_index]
        W1, b1, W2, b2, W3, b3 = train_ffn_maxout_classifier(X_train, y_train, step_size, reg, gd_iters)
        test_accuracy = test_ffn_maxout_classifier(X_valid, y_valid, W1, b1, W2, b2, W3, b3)
        CV_pred_accuracies.append(test_accuracy)
        print("cumulative CV accuracy: ", np.mean(CV_pred_accuracies),"\n")
    return W1, b1, W2, b2, W3, b3

# FFN model    
def train_ffn_maxout_classifier(X_train, y_train, step_size, reg, gd_iters, hidden_size=100, print_every=None):
    """Fit a one-hidden-layer maxout network (two linear pieces) with full-batch gradient descent.

    Each hidden unit outputs ``max(X . W1 + b1, X . W2 + b2)``; a linear layer
    ``(W3, b3)`` followed by softmax produces the class probabilities.

    Parameters
    ----------
    X_train : ndarray, shape (N, D)
        Training samples, one per row.
    y_train : integer array, shape (N,)
        Class indices starting at 0; the number of classes is taken to be
        ``max(y_train) + 1``.
    step_size : float
        Learning rate applied to every parameter update.
    reg : float
        L2 regularisation strength, applied to all three weight matrices.
    gd_iters : int
        Number of gradient-descent iterations over the whole training set.
    hidden_size : int, optional
        Number of maxout units. Defaults to 100, the value that was
        previously hard-coded.
    print_every : int or None, optional
        When set to a positive integer, the loss is computed and printed every
        ``print_every`` iterations. The default prints no loss.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2, W3, b3)`` with shapes ``(D, h)``, ``(1, h)``,
        ``(D, h)``, ``(1, h)``, ``(h, K)`` and ``(1, K)``.

    Notes
    -----
    Weights are drawn from the global NumPy random state. The three
    hyper-parameters ``reg``, ``step_size`` and ``gd_iters`` are printed
    before training.
    """
    h = hidden_size  # number of maxout units
    K = int(np.max(y_train)) + 1  # number of output classes, labels start at 0
    D = X_train.shape[1]  # number of input features

    # Draw order (W3, W1, W2) is kept so seeded runs reproduce the same initialisation.
    W3 = 0.01*np.random.randn(h,K)
    b3 = np.zeros((1,K))
    W1 = 0.01*np.random.randn(D,h)
    b1 = np.zeros((1,h))
    W2 = 0.01*np.random.randn(D,h)
    b2 = np.zeros((1,h))

    # The batch is the whole training set.
    num_examples = X_train.shape[0]
    rows = np.arange(num_examples)

    print("reg param:",reg)
    print("step size:",step_size)
    print("iterations:",gd_iters)

    for i in range(gd_iters):
        # Forward pass: two linear pieces, element-wise maximum, linear scores.
        hidden_layer1 = np.dot(X_train, W1) + b1
        hidden_layer2 = np.dot(X_train, W2) + b2
        hidden_layer = np.maximum(hidden_layer1, hidden_layer2)
        scores = np.dot(hidden_layer, W3) + b3

        # Softmax is invariant to a per-row shift. Subtracting the row maximum
        # keeps np.exp from overflowing to inf, which used to produce NaN
        # probabilities and then NaN parameters.
        scores -= np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(scores)
        probs = exp_scores/np.sum(exp_scores, axis = 1, keepdims = True)

        # The loss is only needed for reporting, so it is computed on demand.
        # Each matrix uses the 0.5 * reg penalty that matches the reg * W
        # gradient applied below.
        if print_every and i % print_every == 0:
            data_loss = -np.mean(np.log(probs[rows, y_train]))
            reg_loss = 0.5*reg*(np.sum(W1*W1) + np.sum(W2*W2) + np.sum(W3*W3))
            print("iteration:", i, " loss:", data_loss + reg_loss)

        # Gradient of the mean cross-entropy w.r.t. the scores: (p - onehot) / N.
        # probs is reused in place; it is not needed afterwards.
        dscores = probs
        dscores[rows, y_train] -= 1
        dscores /= num_examples

        # Output layer.
        dW3 = np.dot(hidden_layer.T, dscores)
        db3 = np.sum(dscores, axis = 0, keepdims = True)

        # Hidden layer: the gradient flows only into the piece that produced
        # the maximum. Ties go to piece 1 so a tied unit still receives its
        # gradient (zeroing both pieces on a tie would drop it).
        dhidden = np.dot(dscores, W3.T)
        first_wins = hidden_layer1 >= hidden_layer2
        dhidden1 = np.where(first_wins, dhidden, 0.0)
        dhidden2 = np.where(first_wins, 0.0, dhidden)

        # Input layer, one gradient per piece.
        dW1 = np.dot(X_train.T, dhidden1)
        db1 = np.sum(dhidden1, axis = 0, keepdims = True)
        dW2 = np.dot(X_train.T, dhidden2)
        db2 = np.sum(dhidden2, axis = 0, keepdims = True)

        # L2 regularisation gradient (weights only, not biases).
        dW2 += reg*W2
        dW1 += reg*W1
        dW3 += reg*W3

        # Parameter update.
        W1 -= step_size * dW1
        W2 -= step_size * dW2
        W3 -= step_size * dW3

        b2 -= step_size * db2
        b1 -= step_size * db1
        b3 -= step_size * db3
    return W1, b1, W2, b2, W3, b3

def test_ffn_maxout_classifier(X_data, y_data, W1, b1, W2, b2, W3, b3):
    # Post-training: evaluate model accuracy
    hidden_layer = np.maximum(np.dot(X_data, W1) + b1, np.dot(X_data, W2) + b2) 
    scores = np.dot(hidden_layer, W3) + b3
    predicted_class = np.argmax(scores, axis = 1) ### scores with max conf
    test_accuracy = (np.mean(predicted_class == y_data))
    return test_accuracy


In [None]:
# Baseline run of the maxout network on the arrays (Xtr/ytr, Xtst/ytst)
# prepared earlier in the notebook. The 80:20 re-split below is disabled.
# Xtr, Xtst, ytr, ytst = test_train_data_split(X, y, 0.8)

# Trained once on the full training arrays; cross_validate_maxout_classifier
# is not called here. Positional hyper-parameters: step_size=0.4, reg=0.001,
# gd_iters=100.
W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr = train_ffn_maxout_classifier(Xtr, ytr, 0.4, 0.001, 100)

# Accuracy on the data the model was fitted on, then on the test arrays.
train_accuracy = test_ffn_maxout_classifier(Xtr, ytr, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)
print("train accuracy: ",train_accuracy)
test_accuracy = test_ffn_maxout_classifier(Xtst, ytst, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)
print("test accuracy: ",test_accuracy,"\n")

#plotting model result (disabled; plot_result_for_maxout is not defined in the visible cells)
# plot_result_for_maxout(Xtst, ytst, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)

# --- Sweep 1: learning rate (reg=0.001, 100 iterations) ---
# Candidates are 0, 10, ..., 90. With step_size 0 the parameter updates are
# all multiplied by zero, so the network stays at its random initialisation.
# NOTE(review): both sweeps in this cell score their candidates on Xtst/ytst,
# i.e. on the same data that is reported as test accuracy.
step_size_values = list(np.arange(0,100,10))  #Also called learning rate
# step_size_values = np.append(step_size_values, list(np.arange(1,10,1)))
# step_size_values = np.append(step_size_values, list(np.arange(10,100,5)))

# One accuracy slot per candidate, filled in below.
test_results = [0] * len(step_size_values)

for i, step_size in enumerate(step_size_values):
    # The *_tr parameters are overwritten by every run; only the accuracy is kept.
    W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr = train_ffn_maxout_classifier(Xtr, ytr, step_size, 0.001, 100)
    test_result = test_ffn_maxout_classifier(Xtst, ytst, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. learning rate; the y axis is fixed to [0, 1].
plt.plot(step_size_values,test_results,'-')
plt.axis([0, max(step_size_values)+1, 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('learning rate')
plt.show()

# --- Sweep 2: regularisation strength ---
# The learning rate is held at 0.4 (100 iterations) and only reg varies.
reg_values = list([0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1])

test_results = [0] * len(reg_values)

for i, reg_value in enumerate(reg_values):
    W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr = train_ffn_maxout_classifier(Xtr, ytr, 0.4, reg_value, 100)
    test_result = test_ffn_maxout_classifier(Xtst, ytst, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)
    print("test accuracy: ",test_result,"\n")
    test_results[i] = test_result

# Accuracy vs. regularisation strength.
plt.plot(reg_values,test_results,'-')
plt.axis([0, max(reg_values), 0, 1])
plt.ylabel('test accuracy')
plt.xlabel('regularization value')
plt.show()

# --- Sweep 3 (disabled): number of gradient-descent iterations ---
# As written below it would use step_size = 0.4 and reg = 0.8.
# gd_iters_vals = list([100, 500, 1000])

# test_results = [0] * len(gd_iters_vals)

# for i, gd_iters in enumerate(gd_iters_vals):
#     W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr = train_ffn_maxout_classifier(Xtr, ytr, 0.4, 0.8, gd_iters)
#     test_result = test_ffn_maxout_classifier(Xtst, ytst, W1_tr, b1_tr, W2_tr, b2_tr, W3_tr, b3_tr)
#     print("test accuracy: ",test_result,"\n")
#     test_results[i] = test_result

# plt.plot(gd_iters_vals,test_results,'-')
# plt.axis([0, max(gd_iters_vals), 0, 1])
# plt.ylabel('test accuracy')
# plt.xlabel('no of iterations')
# plt.show()
