# Train Neural Network (RNN for NLP)

In [1]:
import numpy as np
import pandas as pd

## Pretrained Word2Vec word vectors

In [2]:
# Toy stand-in for pretrained Word2Vec: every word gets a 2-dimensional vector
# (real Word2Vec vectors would have 300 dimensions).
# Each row is (word, first component, second component).
_WORD_VECTOR_ROWS = (
    ("i", 0.02, 0.03),
    ("like", 0.01, 0.9),
    ("hate", 0.9, 0.01),
    ("ramen", 0.03, 0.9),
    ("sushi", 0.03, 0.89),
    ("steak", 0.9, 0.03),
    ("bbq", 0.89, 0.03),
    ("in", 0.01, 0.3),
    ("tokyo", 0.5, 0.1),
    ("texas", 0.1, 0.5),
)
word2vec_dict = {word: [dim0, dim1] for word, dim0, dim1 in _WORD_VECTOR_ROWS}

## String to Integer Mapping

In [3]:
# Vocabulary in id order: by convention id 0 is reserved for padding and
# id 1 for out-of-vocabulary words; real words start at id 2.
_VOCAB = [
    "PAD", "UNK",
    "i", "like", "hate",
    "ramen", "sushi", "steak", "bbq",
    "in", "tokyo", "texas",
]
stoi = {token: index for index, token in enumerate(_VOCAB)}

## Load text data

In [4]:
# The file is tab-separated despite its .csv extension; read_table uses
# '\t' as its default delimiter.
df = pd.read_table('data/data.csv')
df

Unnamed: 0,document,label
0,i like ramen in tokyo,1
1,i like sushi in tokyo,1
2,i hate ramen in tokyo,0
3,i hate sushi in tokyo,0
4,i like steak in texas,0
5,i like bbq in texas,0
6,i hate steak in texas,1
7,i hate bbq in texas,1


## Tokenize (convert each word into integer)

In [5]:
# Tokenize on whitespace and map every word to its integer id.
# Words missing from the vocabulary fall back to the UNK id instead of raising
# a KeyError, and split() (no argument) ignores repeated/leading/trailing spaces.
# No padding is necessary as all documents have exactly 5 words.
df['X'] = df['document'].apply(
    lambda doc: [stoi.get(word, stoi["UNK"]) for word in doc.split()]
)

In [6]:
# Token-id sequences next to their labels
df.loc[:, ['X', 'label']]

Unnamed: 0,X,label
0,"[2, 3, 5, 9, 10]",1
1,"[2, 3, 6, 9, 10]",1
2,"[2, 4, 5, 9, 10]",0
3,"[2, 4, 6, 9, 10]",0
4,"[2, 3, 7, 9, 11]",0
5,"[2, 3, 8, 9, 11]",0
6,"[2, 4, 7, 9, 11]",1
7,"[2, 4, 8, 9, 11]",1


## Convert each word integer into its word vector representation

In [7]:
# Inverse of `stoi`: integer id -> word
itos = dict(zip(stoi.values(), stoi.keys()))

In [8]:
# Replace every token id with its word vector.
# PAD and UNK have ids in `stoi` but no entry in `word2vec_dict`, so ids
# without a vector map to an all-zero vector of the same dimension instead of
# raising a KeyError.
# NOTE: this overwrites the id sequences in `X` in place, so the tokenize cell
# must be re-run before this cell can be run a second time.
_zero_vector = [0.0] * len(next(iter(word2vec_dict.values())))
df['X'] = df['X'].apply(
    lambda ids: [word2vec_dict.get(itos[token_id], _zero_vector) for token_id in ids]
)

In [9]:
# Word-vector sequences next to their labels
df.loc[:, ['X', 'label']]

Unnamed: 0,X,label
0,"[[0.02, 0.03], [0.01, 0.9], [0.03, 0.9], [0.01...",1
1,"[[0.02, 0.03], [0.01, 0.9], [0.03, 0.89], [0.0...",1
2,"[[0.02, 0.03], [0.9, 0.01], [0.03, 0.9], [0.01...",0
3,"[[0.02, 0.03], [0.9, 0.01], [0.03, 0.89], [0.0...",0
4,"[[0.02, 0.03], [0.01, 0.9], [0.9, 0.03], [0.01...",0
5,"[[0.02, 0.03], [0.01, 0.9], [0.89, 0.03], [0.0...",0
6,"[[0.02, 0.03], [0.9, 0.01], [0.9, 0.03], [0.01...",1
7,"[[0.02, 0.03], [0.9, 0.01], [0.89, 0.03], [0.0...",1


## Separate into feature and label

In [10]:
# Stack the per-document lists of word vectors into a single array
X = np.array(df['X'].tolist())

# Axes: (batch, sequence, word vector dimension)
X.shape

(8, 5, 2)

In [11]:
# Look at the first document: one row per word, one column per
# word-vector dimension
X[0]

array([[0.02, 0.03],
       [0.01, 0.9 ],
       [0.03, 0.9 ],
       [0.01, 0.3 ],
       [0.5 , 0.1 ]])

In [12]:
# One-hot encode labels: label 1 -> [0., 1.], anything else -> [1., 0.]
_is_positive = (df.label.values == 1)[:, None]
y = np.where(_is_positive, [0., 1.], [1., 0.])

In [13]:
# One-hot labels: [0., 1.] where the label is 1, [1., 0.] otherwise
y

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

## Activations

In [14]:
# Softmax function for classification
def softmax(x):
    """Turn scores into probabilities.

    A 2-D input is normalised row by row (each row sums to 1); any other
    input is normalised over all of its elements. The maximum is subtracted
    before exponentiating so large scores do not overflow.
    """
    if x.ndim != 2:
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    row_max = np.max(x, axis=1, keepdims=True)
    exps = np.exp(x - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)

# ReLU for forward
def relu(x):
    """Element-wise max(x, 0): negatives become 0, the rest pass through."""
    clipped = np.maximum(x, 0)
    return clipped

# ReLU for backward
def relu_grad(x):
    """Derivative mask of ReLU: 1.0 where x >= 0 (zero included), else 0.0."""
    return np.where(x >= 0, 1.0, 0.0)

# tanh for backward
def tanh_grad(d, W, h):
    """Back-propagate `d` through the weights `W` and a tanh whose output is `h`.

    Computes (d . W^T) * (1 - h^2), using d/dz tanh(z) = 1 - tanh(z)^2.
    """
    upstream = np.dot(d, W.T)
    local_slope = 1 - np.power(h, 2)
    return upstream * local_slope

## Loss function

In [15]:
# Cross entropy loss
def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    Parameters
    ----------
    y : array-like, shape (batch, classes) or (classes,)
        Predicted probabilities.
    t : array-like, same shape as `y`
        One-hot targets.

    Returns
    -------
    float
        Sum of -t * log(y) over all entries, divided by the batch size.
    """
    y = np.asarray(y)
    t = np.asarray(t)

    # A single 1-D sample is a batch of one. Without this, shape[0] would be
    # the number of classes and the loss would be divided by the wrong value.
    if y.ndim == 1:
        y = y.reshape(1, -1)
        t = t.reshape(1, -1)

    delta = 1e-7  # keeps log() finite when a predicted probability is 0
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + delta)) / batch_size

## Architecture

In [16]:
class RNN:
    """Unrolled recurrent classifier with a separate set of weights per time step.

    For each step k (1-based) the forward pass computes

        in_k = relu(x[:, k-1] . Wx_k)
        h_k  = tanh((h_{k-1} + in_k) . Wh_k + b_k)        with h_0 = 0

    and the class probabilities are softmax(h_last . W_out + b_out).

    Parameters are stored in `self.params` under the keys
    'Wx1', 'Wh1', 'b1', ..., and the output layer uses index num_steps + 1
    ('W6' / 'b6' for the default of 5 steps).
    """

    # Initialize weights (random) and biases (zeros)
    # Wx (input to hidden)
    # Wh (hidden to hidden)
    def __init__(self, word_vector_size: int, hidden_size: int, output_size: int,
                 weight_init_std: float = 0.01, num_steps: int = 5):
        """Create the parameters.

        Parameters
        ----------
        word_vector_size : int
            Dimension of each input word vector.
        hidden_size : int
            Dimension of the hidden state.
        output_size : int
            Number of output classes.
        weight_init_std : float
            Scale applied to the standard-normal initial weights.
        num_steps : int
            Number of sequence positions consumed (default 5).

        Raises
        ------
        ValueError
            If `num_steps` is smaller than 1.
        """
        if num_steps < 1:
            raise ValueError("num_steps must be at least 1")

        self.num_steps = num_steps
        self.params = {}
        # Keys are inserted (and random numbers drawn) in the order
        # Wx1, Wh1, b1, Wx2, ... so seeded runs keep the same initial weights.
        for step in range(1, num_steps + 1):
            self.params['Wx{}'.format(step)] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
            self.params['Wh{}'.format(step)] = weight_init_std * np.random.randn(hidden_size, hidden_size)
            self.params['b{}'.format(step)] = np.zeros(hidden_size)

        out = num_steps + 1
        self.params['W{}'.format(out)] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b{}'.format(out)] = np.zeros(output_size)

    def _forward(self, x):
        """Run the forward pass and keep the intermediates backprop needs.

        Returns a tuple (y, pre_relu, summed, hidden) where, for step index i,
        pre_relu[i] is x[:, i] . Wx, summed[i] is h_{i-1} + relu(pre_relu[i])
        and hidden[i] is the tanh output of that step.
        """
        p = self.params
        pre_relu, summed, hidden = [], [], []
        h_prev = None

        for step in range(1, self.num_steps + 1):
            z = np.dot(x[:, step - 1], p['Wx{}'.format(step)])
            embedded = relu(z)
            if h_prev is None:
                # Initial hidden state is all zeros
                h_prev = np.zeros(embedded.shape)
            s = h_prev + embedded
            h_prev = np.tanh(np.dot(s, p['Wh{}'.format(step)]) + p['b{}'.format(step)])

            pre_relu.append(z)
            summed.append(s)
            hidden.append(h_prev)

        # hidden to output
        out = self.num_steps + 1
        scores = np.dot(h_prev, p['W{}'.format(out)]) + p['b{}'.format(out)]
        return softmax(scores), pre_relu, summed, hidden

    # Predict function
    def predict(self, x):
        """Return class probabilities for `x`, indexed as x[:, step]."""
        return self._forward(x)[0]

    # Loss function
    def loss(self, x, t):
        """Cross-entropy between the predictions for `x` and one-hot targets `t`."""
        y = self.predict(x)

        return cross_entropy_error(y, t)

    # Metrics
    def accuracy(self, x, t):
        """Fraction of rows whose arg-max prediction equals the arg-max of `t`."""
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # Gradient
    def gradient(self, x, t, verbose=True):
        """Back-propagate the loss and return a gradient for every parameter.

        Parameters
        ----------
        x : ndarray
            Input batch, indexed as x[:, step].
        t : ndarray
            One-hot targets.
        verbose : bool
            When True (default) the forward-pass predictions are printed.

        Returns
        -------
        dict
            Same keys, in the same order, as `self.params`.
        """
        p = self.params
        # Pre-populate so the gradient dict has the same key order as the params
        grads = dict.fromkeys(p)

        batch_num = x.shape[0]

        # Forward
        y, pre_relu, summed, hidden = self._forward(x)
        if verbose:
            print("Prediction: {}".format(y))

        # Back-propagation
        out = self.num_steps + 1
        W_out = p['W{}'.format(out)]
        # Gradient of softmax + cross-entropy w.r.t. the output scores
        dy = (y - t) / batch_num
        grads['W{}'.format(out)] = np.dot(hidden[-1].T, dy)
        grads['b{}'.format(out)] = np.sum(dy, axis=0)

        # Walk the steps backwards. `upstream` is the gradient at the
        # pre-activation of the layer above and `W_above` that layer's weights.
        upstream, W_above = dy, W_out
        for step in range(self.num_steps, 0, -1):
            i = step - 1
            Wh = p['Wh{}'.format(step)]

            dz = tanh_grad(upstream, W_above, hidden[i])
            dx = relu_grad(pre_relu[i]) * np.dot(dz, Wh.T)

            grads['Wh{}'.format(step)] = np.dot(summed[i].T, dz)
            grads['b{}'.format(step)] = np.sum(dz, axis=0)
            # Use the batch that was passed in (`x`), not a notebook-level global
            grads['Wx{}'.format(step)] = np.dot(x[:, i].T, dx)

            upstream, W_above = dz, Wh

        return grads

## Initialize Network

In [17]:
# Seed NumPy's global RNG so the random weight initialisation in RNN.__init__
# (np.random.randn) -- and therefore every training output below -- is the same
# after Restart & Run All.
SEED = 0
np.random.seed(SEED)

learning_rate = 0.1
network = RNN(word_vector_size=2, hidden_size=4, output_size=2)

## Train

In [18]:
def train_1_epoch(X, y, model=None, lr=None):
    """Run one full-batch gradient-descent step and print the progress.

    Parameters
    ----------
    X : ndarray
        Input batch passed to the model.
    y : ndarray
        One-hot targets.
    model : object, optional
        Anything exposing `params`, `gradient`, `loss` and `accuracy`.
        Defaults to the notebook-level `network`.
    lr : float, optional
        Step size. Defaults to the notebook-level `learning_rate`.
    """
    if model is None:
        model = network
    if lr is None:
        lr = learning_rate

    # Calculate gradient
    grad = model.gradient(X, y)
    # What is printed here are the gradients, not the weights and biases themselves
    print("Gradients: ")
    print(grad)

    # Update parameters in place: one gradient-descent step per parameter
    for key, value in grad.items():
        model.params[key] -= lr * value

    # Calculate loss (after the update)
    loss = model.loss(X, y)
    print("Loss: "+str(loss))

    # Calculate accuracy (after the update)
    accuracy = model.accuracy(X, y)
    print("Accuracy: "+str(accuracy))

### Train 10 epochs

In [19]:
# Ten full-batch passes over the data; every epoch logs its number first and
# ends with an extra newline to separate it from the next one.
for epoch in range(1, 11):
    print("Epoch "+str(epoch))
    train_1_epoch(X, y)
    print("\n")

Epoch 1
Prediction: [[0.5000003 0.4999997]
 [0.5000003 0.4999997]
 [0.5000003 0.4999997]
 [0.5000003 0.4999997]
 [0.5       0.5      ]
 [0.5       0.5      ]
 [0.5       0.5      ]
 [0.5       0.5      ]]
Weights and biases: 
{'Wx1': array([[ 0.00000000e+00,  1.40523522e-19, -9.49859697e-20,
        -2.70085443e-20],
       [ 0.00000000e+00,  2.10785284e-19, -1.42478955e-19,
        -4.05128165e-20]]), 'Wh1': array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [-6.03120176e-21, -3.58279239e-20, -2.94442652e-20,
        -5.91174445e-21],
       [-2.79935318e-20, -1.66293579e-19, -1.36664135e-19,
        -2.74390765e-20],
       [-3.78096950e-20, -2.24605797e-19, -1.84586544e-19,
        -3.70608154e-20]]), 'b1': array([-4.37654816e-17, -2.59985722e-16, -2.13662632e-16, -4.28986383e-17]), 'Wx2': array([[-1.99426733e-17, -1.15667516e-16,  0.00000000e+00,
         0.00000000e+00],
       [-2.21585259e-19, -1.28519462e-18,  0.00000000e+00,
         0.