# Assign7: RNN

In [81]:
import numpy as np
import pandas as pd
import warnings
import copy
import random
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import time
warnings.filterwarnings("ignore")

### Reading in training data

In [2]:
f = open("./Train_Arabic_Digit.txt", "r")

In [3]:
dataset = []
point = []
for line in f:
    if line.isspace() == True and len(point) > 0:
        dataset.append(copy.deepcopy(point))
        point = []
    else:
        if line.isspace() == False:
            line_list = line.strip().split()
            line_float = [ float(x) for x in line_list ]
            point.append(line_float)
dataset.append(copy.deepcopy(point))

In [4]:
f.close()

In [5]:
len(dataset)

6600

### Reading in testing data

In [6]:
f = open("./Test_Arabic_Digit.txt", "r")

In [7]:
testset = []
point = []
for line in f:
    if line.isspace() == True and len(point) > 0:
        testset.append(copy.deepcopy(point))
        point = []
    else:
        if line.isspace() == False:
            line_list = line.strip().split()
            line_float = [ float(x) for x in line_list ]
            point.append(line_float)
testset.append(copy.deepcopy(point))

In [8]:
f.close()

In [9]:
len(testset)

2200

### Making the ground truth labels according to documentation

In [10]:
y_train = []
y_test = []
for i in range(10):
    for j in range(660):
        y_train.append(i)
        
for i in range(10):
    for j in range(220):
        y_test.append(i)

In [11]:
len(y_train)

6600

In [12]:
len(y_test)

2200

### Create One-Hot Encodings

In [13]:
# create a one hot encoding of the target labels
y = np.zeros( (len(y_train), max(y_train) + 1) )
y[np.arange(len(y_train)), y_train] = 1
y_train = copy.deepcopy(y)

In [14]:
# create a one hot encoding of the target labels
y = np.zeros( (len(y_test), max(y_test) + 1) )
y[np.arange(len(y_test)), y_test] = 1
y_test = copy.deepcopy(y)

# RNN Function

### ReLU activation function

In [46]:
def relu(x):
    return (np.maximum(0, x))

### Softmax derivative function

In [61]:
def df_softmax(x):
    df = []
    for i in range(len(x)):
        val = x[i] * (1 - x[i])
        df.append(val)
    df = np.array(df)
    return df

### ReLU derivative function

In [65]:
def df_relu(x):
    df = []
    for i in range(len(x)):
        if x[i] <= 0:
            df.append(0)
        else:
            df.append(1)
    df = np.array(df)
    return df

### Training Step Function

In [96]:
def RNN_Training(x, y):
    # define some constants
    maxiter = 40
    d = 13
    m = 16
    p = 10
    eta = 1e-5
    # hidden layer activation function - ReLU
    # output layer activation function - softmax
    
    final_output = []
    final_ground_truth = []
    final_loss = []
    # initialize bias vectors
    bh = np.random.rand(m)
    bh = bh * 0.1
    bo = np.random.rand(p)
    bo = bo * 0.1
    #initialize weight matrices
    Wi = np.random.rand(d, m)
    Wi = Wi * 0.1
    Wh = np.random.rand(m, m)
    Wh = Wh * 0.1
    Wo = np.random.rand(m, p)
    Wo = Wo * 0.1
    # iteration counter
    r = 0
    
    while (r < maxiter):
        final_output = []
        final_ground_truth = []
        start_time = time.time()
        
        # iterate through dataset in random order
        s = list(range(len(x)))
        random.shuffle(s)
        for i in s:
            
            ### FEED FORWARD PHASE ###
            sequence = x[i]
            tau = len(sequence)
            
            # initialize hidden state
            h = np.zeros((tau, m))
            # iteratively calculate hidden state
            for j in range(tau):
                t1 = np.dot(Wi.T, sequence[j]) + np.dot(Wh.T, h[j-1]) + bh
                h[j] = relu(t1)
            # compute output layer
            o = softmax(np.dot(Wo.T, h[tau-1]) + bo)
            
            # if it's the final iteration, save the softmax output and truth labels
            #if r == maxiter-1:
            if True:    
                final_output.append(o.copy())
                final_ground_truth.append(copy.deepcopy(y[i]))
            
            # calculate final loss for each data point 
            if r == maxiter - 1:
                summ = 0
                for j in range(p):
                    prod = y[i][j] * np.log(o[j])
                    summ += prod
                summ = -1 * summ
                final_loss.append(summ)
                
                
            ### BACK PROP PHASE ###
            derivative_loss = o - y[i]
            derivative_softmax = df_softmax(o)
            # net gradients at output
            delta_o = derivative_softmax * derivative_loss 
            # get delta at tau
            t2 = np.dot(Wo, delta_o)
            derivative_relu = df_relu(h[tau-1])
            delta_tau = derivative_relu * t2
            # net gradients at hidden layer
            # computed iteratively
            delta_h = np.zeros((tau, m))
            delta_h[tau-1] = delta_tau
            for j in range(tau-2, -1, -1):
                hidden_partial = df_relu(h[j])
                t3 = np.dot(Wh, delta_h[j+1])
                net_grad = hidden_partial * t3
                delta_h[j] = net_grad
            
            # now get the gradients of weights matrices and bias vectors
            gradient_bo = delta_o.copy()
            
            gradient_wo = np.zeros((m, p))
            prod = np.outer(h[tau-1], delta_o)
            gradient_wo += prod
            
            gradient_bh = np.zeros(m)
            for j in range(tau):
                gradient_bh += delta_h[j]
                
            gradient_wh = np.zeros((m, m))
            for j in range(tau):
                prod = np.outer(h[j-1], delta_h[j])
                gradient_wh += prod
                
            gradient_wi = np.zeros((d, m))
            for j in range(tau):
                prod = np.outer(sequence[j], delta_h[j])
                gradient_wi += prod
            
            # gradient descent
            bo = bo - eta * gradient_bo
            Wo = Wo - eta * gradient_wo
            bh = bh - eta * gradient_bh
            Wh = Wh - eta * gradient_wh
            Wi = Wi - eta * gradient_wi
            
        
        
        r += 1
        print("EPOCH:", r, "Time:", round(time.time()-start_time, 2), "seconds")
    
        # compute the accuracy
        correct = 0
        total = 0
        for i in range(len(final_output)):
            pred = np.argmax(final_output[i])
            actual = np.argmax(final_ground_truth[i])
            if pred == actual:
                correct += 1
            total += 1
        acc = correct / total
        print("Accuracy Score: ", acc)
    
    avg_loss = np.mean(final_loss)
    print("Average Cross Entropy Loss:", avg_loss)
    
    
    
    

### RNN On Training Set

In [92]:
RNN_Training(dataset, y_train)

EPOCH: 1 Time: 5.65 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 2 Time: 5.44 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 3 Time: 5.44 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 4 Time: 5.43 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 5 Time: 5.44 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 6 Time: 5.66 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 7 Time: 5.56 seconds
Accuracy Score:  0.10333333333333333
EPOCH: 8 Time: 5.49 seconds
Accuracy Score:  0.10333333333333333


KeyboardInterrupt: 

### RNN On Testing Set

In [None]:
RNN_Training(testset, y_test)

EPOCH: 1 Time: 1.86 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 2 Time: 1.93 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 3 Time: 2.12 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 4 Time: 1.86 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 5 Time: 1.84 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 6 Time: 1.86 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 7 Time: 1.86 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 8 Time: 1.83 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 9 Time: 1.86 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 10 Time: 1.8 seconds
Accuracy Score:  0.09909090909090909
EPOCH: 11 Time: 1.81 seconds
Accuracy Score:  0.09954545454545455
EPOCH: 12 Time: 1.8 seconds
Accuracy Score:  0.09954545454545455
EPOCH: 13 Time: 1.82 seconds
Accuracy Score:  0.09954545454545455
EPOCH: 14 Time: 1.82 seconds
Accuracy Score:  0.1
EPOCH: 15 Time: 1.8 seconds
Accuracy Score:  0.1
EPOCH: 16 Time: 1.82 seconds
Accuracy Score:  0.1
EPOC