In [27]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import math
from tqdm import tqdm

In [28]:
# Read the train and test CSV files (no header row) into NumPy arrays.
train_data, test_data = (
    pd.read_csv(csv_path, header=None).to_numpy()
    for csv_path in ("./Alphabets/train.csv", "./Alphabets/test.csv")
)

In [29]:
# Features are every column except the last, divided by 255; the last column
# holds the label (presumably pixel intensities and a class id - confirm
# against the dataset description).
train_x, train_y = train_data[:, :-1] / 255, train_data[:, -1]
test_x, test_y = test_data[:, :-1] / 255, test_data[:, -1]

# Neural Network Class
## PART A

In [30]:
"""Training algorithms for deep learning models are usually iterative in nature and thus 
require the user to specify some initial point from which to begin the iterations. 
Moreover, training deep models is a sufficiently difficult task that most algorithms are
strongly affected by the choice of initialization. 
(The quote above is not mine; I read it online while trying to understand the weight-initialization part.)"""

class neuralNetwork:
    """Fully connected feed-forward network trained with mini-batch gradient descent.

    The output layer always uses a sigmoid activation; hidden layers use the
    activation selected by ``activationMode``. Training minimises the squared
    error between the network output and the one-hot encoded labels.

    Each entry of ``self.parameter`` is ``[weight, bias]`` where ``weight`` has
    shape ``(layer_size, previous_layer_size)`` and ``bias`` has shape
    ``(layer_size, 1)``.
    """

    # Hidden-layer activations understood by the forward/backward passes.
    _ACTIVATION_MODES = ("sigmoid", "relu")

    def __init__(self,batchSize,input_features,architecture,target_class,eta,max_iter,activationMode,learningRate):
        """Store the hyper-parameters; weights are created later by ``fit``.

        Args:
            batchSize: number of samples per mini-batch.
            input_features: number of input columns.
            architecture: sequence with the size of each hidden layer.
            target_class: number of output classes (size of the output layer).
            eta: base learning rate.
            max_iter: maximum number of epochs.
            activationMode: ``"sigmoid"`` or ``"relu"`` for the hidden layers.
            learningRate: ``"normal"`` (constant ``eta``) or ``"adaptive"``
                (``eta / sqrt(epoch)``); any other value behaves like
                ``"normal"``.

        Raises:
            ValueError: if ``activationMode`` is not supported.
        """
        # Fail early: an unknown mode used to surface only as an
        # UnboundLocalError in the middle of the first forward pass.
        if activationMode not in self._ACTIVATION_MODES:
            raise ValueError(
                "activationMode must be one of %s, got %r"
                % (self._ACTIVATION_MODES, activationMode)
            )

        self.batchSize = batchSize
        self.input_features = input_features
        self.architecture = architecture
        self.target_class = target_class
        self.learningRate = learningRate
        self.activationMode = activationMode

        # One slot per layer (hidden layers + output layer).
        self.total_layers = len(architecture) + 1
        self.parameter = []
        self.layer_input = [0] * self.total_layers
        self.layer_output = [0] * self.total_layers
        self.layer_delta = [0] * self.total_layers

        self.eta = eta
        self.max_iter = max_iter

    def activation(self,x,mode):
        """Apply the ``mode`` activation ("sigmoid" or "relu") element-wise to ``x``.

        Raises:
            ValueError: if ``mode`` is not a supported activation.
        """
        if mode == "sigmoid":
            return 1/(1+np.exp(-x))
        if mode == "relu":
            return np.where(x<0,0,x)
        raise ValueError("unknown activation mode: %r" % (mode,))

    def differentiation(self,op,mode):
        """Return the activation derivative expressed through the activation OUTPUT ``op``.

        Raises:
            ValueError: if ``mode`` is not a supported activation.
        """
        if mode == "sigmoid":
            return op*(1-op)
        if mode == "relu":
            # relu output is positive exactly where its input was positive.
            return 1 * (op > 0)
        raise ValueError("unknown activation mode: %r" % (mode,))

    def oneHotEncoding(self,y): #y is **list** of ouput label 0<=y<26
        """Convert integer labels ``y`` into a ``(len(y), target_class)`` one-hot matrix."""
        a = np.array(y)
        b = np.zeros((a.size, self.target_class))
        b[np.arange(a.size),a] = 1
        return b

    def initalize_parameters(self):
        """(Re)create the weights and biases of every layer.

        Weights are drawn from a standard normal distribution scaled by
        ``1/sqrt(fan_in)``; biases start at zero. The global NumPy RNG is
        re-seeded with the layer index before each draw, so initialisation
        (and the shuffling that follows in ``fit``) is reproducible.
        """
        # Start from scratch: appending to the existing list made a second
        # fit() silently keep training the old weights.
        self.parameter = []

        # list(...) so a tuple architecture is accepted as well.
        layer_sizes = list(self.architecture) + [self.target_class]
        fan_in = self.input_features

        for layer, fan_out in enumerate(layer_sizes):
            np.random.seed(layer)
            layer_weight = np.random.randn(fan_out, fan_in)/math.sqrt(fan_in)
            # Biases are deliberately zero, not small random values.
            layer_bias = np.zeros((fan_out,1))
            self.parameter.append([layer_weight,layer_bias])
            # The next layer consumes this layer's outputs.
            fan_in = fan_out

    def full_feedForward(self,X):
        """Run a forward pass over ``X``, caching every layer's input and output.

        The result of the whole network is ``self.layer_output[-1]``; it is
        what ``costFunction`` and ``score`` read afterwards.
        """
        current_input = np.asarray(X)
        output_layer = self.total_layers - 1

        for layer in range(self.total_layers):
            weight, bias = self.parameter[layer]
            netJ = np.dot(current_input,weight.T) + bias.T

            # The output layer is always sigmoid; hidden layers follow activationMode.
            mode = "sigmoid" if layer == output_layer else self.activationMode
            G_netJ = self.activation(netJ,mode)

            self.layer_input[layer] = current_input
            self.layer_output[layer] = G_netJ
            current_input = G_netJ

    def full_backpropagation(self,Y):
        """Compute ``self.layer_delta`` for every layer from the cached forward pass.

        ``Y`` is the one-hot target matrix of the batch that was last fed
        forward. Each delta is the NEGATIVE gradient of the cost with respect
        to that layer's net input, which is why ``updateParameters`` adds it.
        """
        lastlayer = self.total_layers - 1
        op = self.layer_output[lastlayer]

        # Output layer: squared-error derivative times the sigmoid derivative,
        # averaged over the batch.
        delta = (Y-op)*self.differentiation(op,"sigmoid")/(Y.shape[0])
        self.layer_delta[lastlayer] = delta

        # Walk backwards, pushing the delta through each downstream weight matrix.
        for layer in range(lastlayer,0,-1):
            theta_downNBR = self.parameter[layer][0]  # weight without bias
            oj = self.layer_output[layer-1]
            delta = np.dot(delta, theta_downNBR)*self.differentiation(oj,self.activationMode)
            self.layer_delta[layer-1] = delta

    def costFunction(self,y):
        """Return the batch cost ``sum((y - output)**2) / (2 * n)`` for the last forward pass."""
        final_op = self.layer_output[self.total_layers-1]
        return (np.sum((y-final_op)**2))/(2*y.shape[0])

    def updateParameters(self,epochCount):
        """Apply one gradient step using the deltas from ``full_backpropagation``.

        ``epochCount`` is the 1-based epoch number; it only matters for the
        "adaptive" schedule, where the step size is ``eta / sqrt(epochCount)``.
        """
        if self.learningRate == "adaptive":
            ETA = self.eta/math.sqrt(epochCount)  # as per question requirement
        else:
            # "normal", and any unrecognised value, keeps the constant rate.
            ETA = self.eta

        for i in range(self.total_layers):
            delta = self.layer_delta[i]
            gradient_W = np.dot(delta.T, self.layer_input[i])
            gradient_B = np.sum(delta,axis = 0).reshape((-1,1))
            self.parameter[i][0] = self.parameter[i][0] + ETA*gradient_W
            self.parameter[i][1] = self.parameter[i][1] + ETA*gradient_B

    def fit(self,x,y):
        """Train on ``x`` / integer labels ``y`` and return the number of epochs run.

        Training stops when the mean batch cost changes by less than ``1e-5``
        for three consecutive epochs, or when ``max_iter`` epochs are reached.
        Every call starts from freshly initialised weights.
        """
        self.initalize_parameters()

        indexes = np.arange(x.shape[0])
        Y = self.oneHotEncoding(y)
        totalBatches = math.ceil(x.shape[0]/self.batchSize)

        newcost = 0
        n_iter_no_change = 3
        i = 0
        while True:
            i += 1

            # Reshuffle every epoch so the mini-batches differ between epochs.
            np.random.shuffle(indexes)

            cost = 0
            for j in range(0,x.shape[0],self.batchSize):
                batch = indexes[j:j + self.batchSize]
                y_batch = Y[batch]

                self.full_feedForward(x[batch])
                self.full_backpropagation(y_batch)
                self.updateParameters(i)

                # Uses the outputs cached by the forward pass above.
                cost += self.costFunction(y_batch)

            oldcost = newcost
            newcost = cost/totalBatches
            improvement = abs(oldcost - newcost)

            if improvement < 1e-5:
                n_iter_no_change -= 1
                if n_iter_no_change == 0:
                    print("convergence reached with total epoch :",i)
                    return i
            else:
                # Any meaningful change resets the patience counter.
                n_iter_no_change = 3

            if i == self.max_iter:
                print("max_iter reached")
                return i

            if i%100 == 0:
                print(improvement)
                print('Current Epoch is : ',i)

    def score(self,x,y):
        """Return the fraction of rows of ``x`` whose arg-max output equals the label in ``y``."""
        self.full_feedForward(x)
        final_op = self.layer_output[self.total_layers-1]
        predicted = np.argmax(final_op,axis =1)
        return np.count_nonzero(predicted == y)/y.shape[0]


# PART B

In [56]:
def plot_b():
    """Part B: compare single-hidden-layer sigmoid networks of different widths.

    For each hidden-layer size a network is trained with a constant learning
    rate of 0.1, then three figures are shown: train/test accuracy, training
    time in minutes, and the number of epochs ``fit`` reported.

    Returns the accuracy, time and epoch lists followed by the matplotlib
    artists created for the figures.
    """
    hidden_sizes = [1,5,10,50,100]
    train_acc_b, test_acc_b, time_list_b, epoch_b = [], [], [], []

    for n_hidden in tqdm(hidden_sizes):
        tic = time.time()
        net = neuralNetwork(100,784,[n_hidden],26,0.1,3000,"sigmoid","normal")
        epoch_b.append(net.fit(train_x,train_y))
        time_list_b.append(time.time()-tic)
        train_acc_b.append(net.score(train_x,train_y))
        test_acc_b.append(net.score(test_x,test_y))

    tick_labels = ['1', '5', '10', '50', '100']
    positions = np.arange(len(tick_labels))  # one x position per hidden size
    bar_width = 0.35

    def _finish_axes(axes, y_label):
        # Shared labelling for all three figures.
        axes.set_ylabel(y_label)
        axes.set_xlabel("Neurons in single hidden layer")
        axes.set_title('Sigmoid and normal learningMode')
        axes.set_xticks(positions)
        axes.set_xticklabels(tick_labels)
        axes.legend()

    # Figure 1: accuracies (in percent) as lines with side-by-side bars.
    train_pct = np.array(train_acc_b)*100
    test_pct = np.array(test_acc_b)*100
    acc_axes = plt.figure().add_subplot(111)
    acc_b1 = acc_axes.plot(positions, test_pct, color="blue")
    acc_b2 = acc_axes.plot(positions, train_pct, color="#ff7f0e")
    acc_b3 = acc_axes.bar(positions - bar_width/2, train_pct, bar_width,
                          label='train accuracy', color="#ff7f0e")
    acc_b4 = acc_axes.bar(positions + bar_width/2, test_pct, bar_width,
                          label='test accuracy', color="blue")
    _finish_axes(acc_axes, 'Accuracy')

    # Figure 2: wall-clock training time, converted from seconds to minutes.
    time_axes = plt.figure().add_subplot(111)
    t = time_axes.bar(positions, np.array(time_list_b)/60, bar_width,
                      label='Time: minutes', color="blue")
    _finish_axes(time_axes, 'Time taken to converge')

    # Figure 3: epochs returned by fit().
    epoch_axes = plt.figure().add_subplot(111)
    epo = epoch_axes.bar(positions, epoch_b, bar_width, label='epochs', color="blue")
    _finish_axes(epoch_axes, 'Epoch/iteration to reach convergence')

    plt.show()
    return (train_acc_b, test_acc_b, time_list_b, epoch_b,
            acc_b1, acc_b2, acc_b3, acc_b4, t, epo)

In [1]:
# Run the Part B experiment: trains one network per hidden-layer size and shows the figures.
plot_b()

# PART C

In [None]:
def plot_c():
    """Part C: repeat the hidden-layer-width comparison with the adaptive learning rate.

    For each hidden-layer size a sigmoid network is trained with base learning
    rate 0.5 and the "adaptive" schedule, then three figures are shown:
    train/test accuracy, training time in minutes, and the number of epochs
    ``fit`` reported.

    Returns the accuracy, time and epoch lists followed by the matplotlib
    artists created for the figures.
    """
    arch  = [1,5,10,50,100]
    train_acc_c = []
    test_acc_c = []
    time_list_c = []
    epoch_c = []
    for i in tqdm(arch):
        startTime = time.time()
        model_c = neuralNetwork(100,784,[i],26,0.5,3000,"sigmoid","adaptive")
        epoch_c.append(model_c.fit(train_x,train_y))
        time_list_c.append(time.time()-startTime)
        train_acc_c.append(model_c.score(train_x,train_y))
        test_acc_c.append(model_c.score(test_x,test_y))

    labels = ['1', '5', '10', '50', '100']
    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    # Explicit colours, the same ones plot_b uses. Without them the bars took
    # the default colour cycle (train=blue, test=orange), the opposite of the
    # lines drawn on the same axes.
    train_color = "#ff7f0e"
    test_color = "blue"

    fig3 = plt.figure()
    ax3 = fig3.add_subplot(111)
    acc_c1 = ax3.plot(x, np.array(train_acc_c)*100,color = train_color)
    acc_c2 = ax3.plot(x, np.array(test_acc_c)*100,color = test_color)
    acc_c3 = ax3.bar(x - width/2, np.array(train_acc_c)*100, width, label='train accuracy',color = train_color)
    acc_c4 = ax3.bar(x + width/2, np.array(test_acc_c)*100, width, label='test accuracy',color = test_color)
    ax3.set_ylabel('Accuracy')
    ax3.set_xlabel("Neurons in single hidden layer")
    ax3.set_title('Sigmoid and adaptive learningMode')
    ax3.set_xticks(x)
    ax3.set_xticklabels(labels)
    ax3.legend()

    fig4 = plt.figure()
    ax4 = fig4.add_subplot(111)
    # Seconds -> minutes for the time axis.
    t = ax4.bar(x, np.array(time_list_c)/60, width, label='Time: minutes',color = "blue")
    ax4.set_ylabel('Time taken to converge')
    ax4.set_xlabel("Neurons in single hidden layer")
    ax4.set_title('Sigmoid and adaptive learningMode')
    ax4.set_xticks(x)
    ax4.set_xticklabels(labels)
    ax4.legend()

    fig4_2 = plt.figure()
    ax4_2 = fig4_2.add_subplot(111)
    epo = ax4_2.bar(x, epoch_c , width, label='epochs',color = "blue")
    ax4_2.set_ylabel('Epoch/iteration to reach convergence')
    ax4_2.set_xlabel("Neurons in single hidden layer")
    ax4_2.set_title('Sigmoid and adaptive learningMode')
    ax4_2.set_xticks(x)
    ax4_2.set_xticklabels(labels)
    ax4_2.legend()

    plt.show()
    return train_acc_c, test_acc_c, time_list_c, epoch_c, acc_c1, acc_c2, acc_c3, acc_c4, t, epo

In [None]:
# Run the Part C experiment: same sweep as Part B but with the adaptive learning rate.
plot_c()

# PART D

In [52]:
def part_d():
    """Part D: train [100, 100] networks with sigmoid and relu hidden layers.

    Both models use batch size 100, base learning rate 0.5 with the
    "adaptive" schedule and at most 3000 epochs. For each one the training
    time (in seconds) and the train/test accuracies are printed.
    """
    print("-----------------Part D started------------------------------")
    model_d_sigmoid = neuralNetwork(100,784,[100,100],26,0.5,3000,"sigmoid","adaptive")

    s = time.time()
    model_d_sigmoid.fit(train_x, train_y)
    # time.time(), not time.ime(): the typo raised AttributeError only after
    # the whole training run had finished.
    print("time taken for sigmoid is :", time.time()-s)

    train_acc_d_sigmoid = model_d_sigmoid.score(train_x, train_y)
    test_acc_d_sigmoid = model_d_sigmoid.score(test_x,test_y)
    print("train accuray is : sigmoid ",train_acc_d_sigmoid)
    print("test accuray is : sigmid  ",test_acc_d_sigmoid)

    model_d_relu = neuralNetwork(100,784,[100,100],26,0.5,3000,"relu","adaptive")

    s = time.time()
    model_d_relu.fit(train_x, train_y)
    print("time taken for relu is :", time.time()-s)

    train_acc_d_relu = model_d_relu.score(train_x, train_y)
    test_acc_d_relu = model_d_relu.score(test_x,test_y)
    print("train accuray is : Relu ",train_acc_d_relu)
    print("test accuray is : Relu ",test_acc_d_relu)

In [None]:
# Run the Part D comparison (sigmoid vs. relu hidden layers).
part_d()

# PART E

In [53]:
# Banner marking the start of Part E in the notebook output.
print("-----------------Part E started------------------------------")
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

def oneHotEncoding(y,targetClass): #y is **list** of ouput label 0<=y<26
    """ converts y label to vector representation called One Hot Encoding """
    a = np.array(y)
    b = np.zeros((a.size,targetClass ))
    b[np.arange(a.size),a] = 1
    return b

-----------------Part E started------------------------------


In [54]:
def part_e():
    """Part E: fit scikit-learn's MLPClassifier on one-hot labels and print accuracies.

    The classifier uses two hidden layers of 100 relu units, the 'sgd' solver
    with batch size 100, initial learning rate 0.5 and the 'invscaling'
    schedule, with momentum and the alpha penalty both set to zero. Train
    accuracy is printed first, then test accuracy, both in percent.
    """
    encoded_labels = oneHotEncoding(train_y,26)

    classifier = MLPClassifier(
        hidden_layer_sizes=(100, 100),
        activation="relu",
        solver='sgd',
        alpha=0.0,
        batch_size=100,
        learning_rate='invscaling',
        learning_rate_init=0.5,
        max_iter=2000,
        random_state=0,
        momentum=0.0,
        verbose=False,
    )
    classifier.fit(train_x, encoded_labels)

    # Compute both probability matrices before printing anything.
    probabilities = [classifier.predict_proba(features) for features in (train_x, test_x)]

    # The predicted class is the column with the highest probability.
    for true_labels, prob in zip((train_y, test_y), probabilities):
        print(accuracy_score(true_labels, prob.argmax(axis = 1))*100)

In [55]:
# Run Part E; the two numbers printed are the train and test accuracy in percent.
part_e()

87.38461538461539
84.39999999999999


In [None]:
# Earlier attempt, kept for reference: written when I thought we needed to train 26 separate binary classifiers, one per class.

# models = []
# for i in tqdm(range(26)):
#     model = MLPClassifier(hidden_layer_sizes=(100,100,), activation='relu', solver='lbfgs', 
#                       alpha=0.0001, batch_size=100, learning_rate='adaptive', 
#                       learning_rate_init=0.5, power_t=0.5, max_iter=200, 
#                       shuffle=True, random_state=None, tol=0.0001, verbose=False, 
#                       warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
#                       validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, 
#                       n_iter_no_change=10, max_fun=15000)
#     model.fit(train_x,Y[:,i])
#     models.append(model)
    
# predictions_prob_train = []
# predictions_prob_test = []
# for i in range(26):
#     model  = models[i]
#     predictions_prob_train.append(model.predict_proba(train_x))
#     predictions_prob_test.append(model.predict_proba(test_x))

    
# def prediction(x,predictions_prob)
#     prediction= []
#     for j in range(x.shape[0]):
#         prob = 0
#         Class = -1
#         for i in range(26):

#             if prob < predictions_prob[i][j][1]:
#                 Class = i
#                 prob = predictions_prob[i][j][1]
#         prediction.append(Class)
#     return prediction


# from sklearn.metrics import accuracy_score

# accuracy_score(test_y,prediction(test_x,predictions_prob_test))*100
# accuracy_score(train_y,prediction(train_x,predictions_prob_train))*100