# Train Neural Network (MLP for NLP)

In [1]:
import numpy as np
import pandas as pd

## Pretrained Word2Vec word vectors

In [2]:
# Toy stand-in for pretrained Word2Vec vectors: each vocabulary word maps to
# a 2-dimensional vector (2 dimensions here instead of 300).
word2vec_dict = dict([
    ("i", [0.02, 0.03]),
    ("like", [0.01, 0.9]),
    ("hate", [0.9, 0.01]),
    ("ramen", [0.03, 0.9]),
    ("sushi", [0.03, 0.89]),
    ("steak", [0.9, 0.03]),
    ("bbq", [0.89, 0.03]),
    ("in", [0.01, 0.3]),
    ("tokyo", [0.5, 0.1]),
    ("texas", [0.1, 0.5]),
])

## String to Integer Mapping

In [3]:
# String -> integer id. A token's id is its position in the list below.
# By convention id 0 is reserved for padding and id 1 for out-of-vocabulary words.
stoi = {
    token: index
    for index, token in enumerate([
        "PAD", "UNK",
        "i", "like", "hate", "ramen", "sushi", "steak", "bbq", "in", "tokyo", "texas",
    ])
}

## Load text data

In [4]:
# Read the tab-separated corpus; later cells use its `document` and `label` columns.
df = pd.read_csv(filepath_or_buffer='data/data.csv', delimiter='\t')
df

Unnamed: 0,document,label
0,i like ramen in tokyo,1
1,i like sushi in tokyo,1
2,i hate ramen in tokyo,0
3,i hate sushi in tokyo,0
4,i like steak in texas,0
5,i like bbq in texas,0
6,i hate steak in texas,1
7,i hate bbq in texas,1


## Tokenize (convert each word into integer)

In [5]:
# Tokenize on whitespace and map every token to its integer id.
# - `split()` (no argument) collapses repeated/leading/trailing whitespace, so
#   stray spaces never produce an empty-string token.
# - Words that are missing from `stoi` fall back to the "UNK" id instead of
#   raising KeyError.
# No padding is necessary as all documents have exactly 5 words.
df['X'] = df['document'].map(
    lambda doc: [stoi.get(token, stoi["UNK"]) for token in doc.split()]
)

In [6]:
df[['X', 'label']]

Unnamed: 0,X,label
0,"[2, 3, 5, 9, 10]",1
1,"[2, 3, 6, 9, 10]",1
2,"[2, 4, 5, 9, 10]",0
3,"[2, 4, 6, 9, 10]",0
4,"[2, 3, 7, 9, 11]",0
5,"[2, 3, 8, 9, 11]",0
6,"[2, 4, 7, 9, 11]",1
7,"[2, 4, 8, 9, 11]",1


## Convert each word integer into its word vector representation

In [7]:
# Reverse lookup table: integer id -> token string (the inverse of `stoi`)
itos = dict(zip(stoi.values(), stoi.keys()))

In [8]:
# Replace every token id in df['X'] with its word vector.
# Ids whose token has no pretrained vector (e.g. "PAD" / "UNK") get an all-zero
# vector of the same width instead of raising KeyError.
# NOTE: this overwrites the integer ids stored in df['X'], so this cell can only
# be re-run after the tokenize cell has been run again.
zero_vector = [0.0] * len(next(iter(word2vec_dict.values())))
df['X'] = df['X'].map(
    lambda ids: [word2vec_dict.get(itos[token_id], zero_vector) for token_id in ids]
)

In [9]:
df[['X', 'label']]

Unnamed: 0,X,label
0,"[[0.02, 0.03], [0.01, 0.9], [0.03, 0.9], [0.01...",1
1,"[[0.02, 0.03], [0.01, 0.9], [0.03, 0.89], [0.0...",1
2,"[[0.02, 0.03], [0.9, 0.01], [0.03, 0.9], [0.01...",0
3,"[[0.02, 0.03], [0.9, 0.01], [0.03, 0.89], [0.0...",0
4,"[[0.02, 0.03], [0.01, 0.9], [0.9, 0.03], [0.01...",0
5,"[[0.02, 0.03], [0.01, 0.9], [0.89, 0.03], [0.0...",0
6,"[[0.02, 0.03], [0.9, 0.01], [0.9, 0.03], [0.01...",1
7,"[[0.02, 0.03], [0.9, 0.01], [0.89, 0.03], [0.0...",1


## Separate into features and labels

In [10]:
# Stack the per-document lists of word vectors into one ndarray
X = np.array(df['X'].tolist())

# Axes: (batch, sequence, word vector dimension)
X.shape

(8, 5, 2)

In [11]:
# Look at the first document
X[0]

array([[0.02, 0.03],
       [0.01, 0.9 ],
       [0.03, 0.9 ],
       [0.01, 0.3 ],
       [0.5 , 0.1 ]])

In [12]:
# One-hot encode labels: label 1 -> [0, 1]; any other label -> [1, 0]
one_hot = {True: [0., 1.], False: [1., 0.]}
y = np.array([one_hot[bool(label == 1)] for label in df['label'].values])

In [13]:
y

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

## Activations

In [14]:
# Softmax function for classification
def softmax(x):
    """Numerically stable softmax.

    A 2-D input is normalised row by row (each row sums to 1). Any other
    rank is normalised over the whole array at once. The maximum is
    subtracted before exponentiating so large inputs do not overflow.
    """
    if x.ndim != 2:
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    # Row-wise: keepdims lets the per-row max / sum broadcast back over columns.
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

# ReLu for forward
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    return np.maximum(0, x)

# ReLu for backward
def relu_grad(x):
    """Element-wise derivative of ReLU evaluated at ``x``.

    Returns a float array shaped like ``x`` holding 1 where ``x >= 0``
    (zero itself counts as active) and 0 elsewhere.
    """
    return np.where(x >= 0, 1.0, 0.0)

## Loss function

In [15]:
# Cross entropy loss
def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    Parameters
    ----------
    y : ndarray
        Predicted probabilities, either one sample ``(classes,)`` or a batch
        ``(batch, classes)``.
    t : ndarray
        One-hot targets with the same shape as ``y``.

    Returns
    -------
    float
        Sum of ``-t * log(y)`` divided by the number of samples.
    """
    # A single 1-D sample is promoted to a batch of one; otherwise
    # y.shape[0] would be the number of classes and the loss would be
    # divided by the wrong quantity.
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    delta = 1e-7  # keeps log() finite when a predicted probability is exactly 0
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + delta)) / batch_size

## Architecture

In [16]:
class MLP:
    """Two-layer perceptron: affine -> ReLU -> affine -> softmax.

    All trainable parameters live in ``self.params`` under the keys
    'W1', 'b1', 'W2' and 'b2'.
    """

    # Initialize weights (random) and biases (zeros)
    def __init__(self, input_size:int, hidden_size:int, output_size:int, weight_init_std:float=0.01, seed=None):
        """Create the parameter dictionary.

        Parameters
        ----------
        input_size, hidden_size, output_size : int
            Layer widths; W1 is (input_size, hidden_size) and W2 is
            (hidden_size, output_size).
        weight_init_std : float
            Scale applied to the standard-normal initial weights.
        seed : int or None
            When given, weights are drawn from a private
            ``np.random.RandomState(seed)`` so construction is reproducible
            and the global NumPy RNG is left untouched. When None the global
            ``np.random.randn`` is used.
        """
        randn = np.random.randn if seed is None else np.random.RandomState(seed).randn

        self.params = {}
        self.params['W1'] = weight_init_std * randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    # Shared forward pass used by both predict() and gradient()
    def _forward(self, x):
        """Run the network and return ``(a1, z1, y)``.

        ``a1`` is the hidden pre-activation, ``z1`` the hidden activation and
        ``y`` the softmax output; back-propagation needs all three.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = relu(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return a1, z1, y

    # Predict function
    def predict(self, x):
        """Return the softmax class probabilities for input ``x``."""
        return self._forward(x)[2]

    # Loss function
    def loss(self, x, t):
        """Return the cross-entropy between ``predict(x)`` and one-hot targets ``t``."""
        y = self.predict(x)

        return cross_entropy_error(y, t)

    # Metrics
    def accuracy(self, x, t):
        """Return the fraction of rows whose arg-max prediction matches the arg-max of ``t``."""
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # Gradient
    def gradient(self, x, t, verbose=True):
        """Back-propagate the loss and return the parameter gradients.

        Uses whatever weights and biases are currently stored in
        ``self.params`` (the initial ones on the first call, the updated ones
        afterwards).

        Parameters
        ----------
        x : ndarray
            Input batch; rows are samples.
        t : ndarray
            One-hot targets, one row per sample.
        verbose : bool
            When True (the default) the current predictions are printed.

        Returns
        -------
        dict
            Gradients keyed 'W1', 'b1', 'W2', 'b2' (in that order), each shaped
            like the matching entry of ``self.params``.
        """
        W2 = self.params['W2']
        batch_num = x.shape[0]

        # Forward
        a1, z1, y = self._forward(x)
        if verbose:
            print("Prediction: {}".format(y))

        # Back-propagation
        # (y - t) / batch is the gradient of the batch-averaged softmax
        # cross-entropy with respect to the output pre-activation.
        dy = (y - t) / batch_num
        grad_W2 = np.dot(z1.T, dy)
        grad_b2 = np.sum(dy, axis=0)

        dz1 = np.dot(dy, W2.T)
        da1 = relu_grad(a1) * dz1
        grad_W1 = np.dot(x.T, da1)
        grad_b1 = np.sum(da1, axis=0)

        return {'W1': grad_W1, 'b1': grad_b1, 'W2': grad_W2, 'b2': grad_b2}

## Initialize Network

In [17]:
# Step size used for every gradient-descent update
learning_rate = 0.1

# The input layer must be as wide as one flattened document
# (sequence length * word vector dimension). Deriving it from X instead of
# hard-coding 10 keeps it correct if the corpus or the embedding size changes;
# the product over shape[1:] works whether or not X has already been flattened.
network = MLP(input_size=int(np.prod(X.shape[1:])), hidden_size=5, output_size=2)

In [18]:
# Collapse every axis after the batch axis into a single feature axis so each
# document becomes one row for the fully connected layer
X = X.reshape(len(X), -1)

## Train

In [19]:
def train_1_epoch(X, y, model=None, lr=None):
    """Run one full-batch gradient-descent step and print diagnostics.

    Parameters
    ----------
    X, y :
        Inputs and one-hot targets, passed unchanged to the model's
        ``gradient``, ``loss`` and ``accuracy`` methods.
    model :
        Object exposing ``params``, ``gradient``, ``loss`` and ``accuracy``.
        Defaults to the notebook-level ``network``.
    lr : float
        Step size. Defaults to the notebook-level ``learning_rate``.

    The model's ``params`` are updated in place; nothing is returned.
    """
    # Fall back to the notebook globals so existing calls train_1_epoch(X, y)
    # keep working.
    if model is None:
        model = network
    if lr is None:
        lr = learning_rate

    # Calculate gradient
    grad = model.gradient(X, y)
    print("Gradients of weights and biases: ")
    print(grad)

    # Update parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        model.params[key] -= lr * grad[key]

    # Calculate loss (after the update)
    loss = model.loss(X, y)
    print("Loss: "+str(loss))

    # Calculate accuracy (after the update)
    accuracy = model.accuracy(X, y)
    print("Accuracy: "+str(accuracy))

### Train 10 epochs

In [20]:
# Ten full-batch passes over the data; each one prints its own diagnostics
for epoch in range(1, 11):
    print("Epoch "+str(epoch))
    train_1_epoch(X, y)
    print("\n")

Epoch 1
Prediction: [[0.50010427 0.49989573]
 [0.50010362 0.49989638]
 [0.50011306 0.49988694]
 [0.50011246 0.49988754]
 [0.50004581 0.49995419]
 [0.50004562 0.49995438]
 [0.50002878 0.49997122]
 [0.50002889 0.49997111]]
Weights and biases: 
{'W1': array([[ 2.37450047e-08,  3.82628329e-05, -1.23545516e-05,
         0.00000000e+00,  3.43637921e-06],
       [ 3.56175071e-08,  5.73942494e-05, -1.85318274e-05,
         0.00000000e+00,  5.15456881e-06],
       [ 5.25576944e-07,  1.96133613e-05, -6.17727579e-06,
         0.00000000e+00,  1.71818960e-06],
       [ 5.54820770e-07,  1.72134554e-03, -5.55954821e-04,
         0.00000000e+00,  1.54637064e-04],
       [ 2.98477338e-07,  1.71154490e-03, -5.52866183e-04,
         0.00000000e+00,  5.15456881e-06],
       [ 7.99742680e-07,  5.81111328e-05, -1.85318274e-05,
         0.00000000e+00,  1.53777970e-04],
       [ 1.18725024e-08,  1.91314165e-05, -6.17727579e-06,
         0.00000000e+00,  1.71818960e-06],
       [ 3.56175071e-07,  5.73942494e