#### String to One Hot

[Referred from here](https://medium.com/@thisislong/building-a-recurrent-neural-network-from-scratch-ba9b27a42856)

In [2]:
import numpy as np
import string
import torch

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [47]:
def string_to_one_hot(inputs: np.ndarray) -> np.ndarray:
    char_to_index = {char: i for i, char in enumerate(string.ascii_uppercase)}

    one_hot_inputs = []
    for row in inputs:
        one_hot_list = []
        for char in row:
            if char.upper() in char_to_index:
                one_hot_vector = np.zeros((len(string.ascii_uppercase), 1))
                one_hot_vector[char_to_index[char.upper()]] = 1
                one_hot_list.append(one_hot_vector)
        one_hot_inputs.append(one_hot_list)

    return np.array(one_hot_inputs)

In [48]:
def string_to_one_hot_torch(inputs: np.ndarray) -> torch.Tensor:
    char_to_index = {char: i for i, char in enumerate(string.ascii_uppercase)}

    one_hot_inputs = []
    for row in inputs:
        one_hot_list = []
        for char in row:
            if char.upper() in char_to_index:
                one_hot_vector = torch.zeros((len(string.ascii_uppercase), 1))
                one_hot_vector[char_to_index[char.upper()]] = 1
                one_hot_list.append(one_hot_vector)
        one_hot_inputs.append(torch.stack(one_hot_list))

    return torch.stack(one_hot_inputs).to(device)

### RNN def

In [1]:
class InputLayer:
    inputs: torch.Tensor
    U: torch.Tensor
    delta_U: torch.Tensor

    def __init__(self, inputs: torch.Tensor, hidden_size: int):
        self.inputs = inputs
        self.U = torch.rand(size=(hidden_size, inputs.shape[0]),device=device).uniform_(0,1)
        self.delta_U = torch.zeros_like(self.U, device=device)

    def get_inputs(self, time_step: int) -> torch.Tensor:
        return self.inputs[time_step]
    
    def get_weights(self, time_step:int) -> torch.Tensor:
        return self.U @ self.get_inputs(time_step)
    
    def wieghted_sum(self, time_step:int) -> torch.Tensor:
        return self.U @ self.get_inputs(time_step=time_step)
    
    def calculate_deltas_per_step(self, time_step:int, delta_wieghted_sum:torch.Tensor) -> None:
        self.delta_U += delta_wieghted_sum @ self.get_inputs(time_step=time_step).T

NameError: name 'torch' is not defined

In [None]:
class HiddenLayer:
    states: torch.Tensor
    W: torch.Tensor
    delta_W: torch.Tensor
    bias: torch.Tensor
    delta_bias: torch.Tensor
    next_delta_activation: torch.Tensor
    
    def __init__(self,vocab_size:int, size:int) -> None:
        self.W = torch.rand(size=(size,size), device=device).uniform_(0,1)
        self.bias = torch.rand(size=(size,1), device=device).uniform_(0,1)
        self.states = torch.zeros(size=(vocab_size,size,1), device=device)
        self.next_delta_activation = torch.zeros_like(self.bias, device=device)
        self.delta_bias = torch.zeros_like(self.bias, device=device)
        self.delta_W = torch.zeros_like(self.W, device=device)
    
    def get_hidden_states(self, time_step:int) -> torch.Tensor:
        if time_step < 0:
            return torch.zeros_like(self.states[0])
        return self.states[time_step]
    
    def set_hidden_state(self, time_step:int, hidden_state:torch.Tensor) -> None:
        self.states[time_step] = hidden_state
    
    def activate(self, wieghted_input:torch.Tensor, time_step:int) -> torch.Tensor:
        previous_hidden_state = self.get_hidden_states(time_step=time_step)
        # W @ h_prev => (h_dim, h_dim) @ (h_dim,1 )
        wieghted_hidden_state = self.W @ previous_hidden_state
        # (h_dim, 1) + (h_dim, 1) + (h_dim, 1) = (h_dim, 1) 
        wieghted_sum = wieghted_input + wieghted_hidden_state + self.bias
        activation = torch.tanh(wieghted_sum)
        self.set_hidden_state(time_step=time_step, hidden_state=activation)
        return activation
    
    def calculate_deltas_per_step(self, time_step: int, delta_output: torch.Tensor) -> torch.Tensor:
        # (h_dim,1) + (h_dim,1) = (h_dim,1)
        delta_activation = delta_output + self.next_delta_activation
        # (h_dim,1) * scaler = (h_dim,1)
        delta_wieghted_sum = delta_activation * ( 1 - self.get_hidden_states(time_step=time_step) ** 2)
        # (h_dim, h_dim) @ (h_dim, 1) = (h_dim, 1)
        self.next_delta_activation = self.W.T @ delta_wieghted_sum
        
        # (h_dim, 1) @ (1, h_dim) = (h_dim, h_dim)
        self.delta_W += delta_wieghted_sum @ self.get_hidden_states(time_step=time_step-1).T
        
        # derivative of hidden bias is same as dL_ds
        self.delta_bias += delta_wieghted_sum
        return delta_wieghted_sum
    
    def update_wieght_and_bias(self, learning_rate:float) -> None:
         self.W -= learning_rate * self.delta_W
         self.bias -= learning_rate * self.delta_bias
        

In [None]:
class OutputLayer:
    prediction_state: torch.Tensor
    V: torch.Tensor
    bias: torch.Tensor
    delta_bias: torch.Tensor
    delta_V: torch.Tensor
    
    def __init__(self, size:int, hidden_size:int) -> None:
        self.V = torch.rand(size=(size,hidden_size),device=device).uniform_(0,1)
        self.bias = torch.rand(size=(size,1),device=device).uniform_(0,1)
        self.prediction_state = torch.zeros(size=(size,size,1),device=device)
        self.delta_bias = torch.zeros_like(self.bias,device=device)
        self.delta_V = torch.zeros_like(self.V,device=device)
    
    def set_state(self, time_step:int, prediction:torch.Tensor) -> None:
        self.prediction_state[time_step] = prediction
    
    def get_state(self, time_step:int) -> torch.Tensor:
        return self.prediction_state[time_step]
    
    def predict(self, hidden_state:torch.Tensor, time_step:int) -> torch.Tensor:
        # V @ h => (input_size, h_dimension) @ (h_dimension,1) = (input_size,1)
        # (input_size,1) + (input_size,1) = (input_size,1)
        output = self.V @ hidden_state + self.bias
        prediction = torch.softmax(input=output,dim=0)
        self.set_state(time_step=time_step, prediction=prediction)
        return prediction
    
    def calculate_deltas_per_step(
        self, 
        expected: torch.Tensor,
        hidden_state: torch.Tensor,
        time_step: int
        ) -> torch.Tensor:
        # dL_dO = dL_dyhat * dyh at_do = derivative of loss * derivative of softmax
        # dL_dO = step.y_hat - expected[step_number]
        delta_output = self.get_state(time_step=time_step) - expected # (input_size,1)
        # (input_size,1) (1, hidden_size) = (input_size, hidden_size)
        self.delta_V += delta_output @ hidden_state.T
        
        # dL_dc += dL_dof
        self.delta_bias += delta_output 
        return self.V.T @delta_output
    
    def updates_wieght_and_bias(self, learning_rate:float) -> None:
        self.V -= learning_rate * self.delta_V
        self.bias -= learning_rate * self.delta_bias
    

In [None]:
class VanillaRNN:
    hidden_layer: HiddenLayer
    output_layer: OutputLayer
    alpha: float # learning rate
    input_layer: InputLayer
    
    def __init__(self, vocab_size:int, hidden_size:int, alpha:float) -> None:
        self.hidden_layer = HiddenLayer(vocab_size=vocab_size,size= hidden_size)
        self.output_layer = OutputLayer(size=vocab_size, hidden_size=hidden_size)
        self.hidden_size = hidden_size
        self.alpha = alpha
    
    def feed_forward(self, inputs: torch.Tensor) -> OutputLayer:
        self.input_layer = InputLayer(inputs=inputs, hidden_size=self.hidden_size)
        for step in range(len(inputs)):
            weighted_input = self.input_layer.wieghted_sum(step)
            activation = self.hidden_layer.activate(wieghted_input=weighted_input, time_step=step)
            self.output_layer.predict(activation, step)
        return self.output_layer
    
    def backpropogation(self, excepted: torch.Tensor) -> None:
        for step_number in reversed(range(len(excepted))):
            delta_output = self.output_layer.calculate_deltas_per_step(
                expected=excepted[step_number],
                hidden_state=self.hidden_layer.get_hidden_states(step_number),
                time_step=step_number,
            )
            delta_wieghted_sum = self.hidden_layer.calculate_deltas_per_step(
                step_number,
                delta_output=delta_output
            )
            self.input_layer.calculate_deltas_per_step(step_number, delta_wieghted_sum=delta_wieghted_sum)
            
            self.output_layer.updates_wieght_and_bias(self.alpha)
            self.hidden_layer.update_wieght_and_bias(self.alpha)
            self.hidden_layer.update_wieght_and_bias(self.alpha)
    
    def loss(self, y_hat:torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return torch.sum(- torch.sum(y) * torch.log(y_hat))
    
    def train(self, inputs: torch.Tensor, excepted: torch.Tensor, epochs:int) -> None:
        for epoch in range(epochs):
            print(f"Epoch {epoch}")
            for idx, input in enumerate(inputs):
                y_hats = self.feed_forward(input)
                self.backpropogation(excepted=excepted[idx])
                y_hat = y_hats.prediction_state[idx]
                y = excepted[idx]
                print(f"Shapes: y_hat: {y_hat.shape}, y: {y.shape}")
                for i in range(len(y)):
                    print(f"i: {i}, Loss: {self.loss(y_hat= y_hat, y=y)}")

In [None]:
inputs = np.array([
    ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"],
    ["Z","Y","X","W","V","U","T","S","R","Q","P","O","N","M","L","K","J","I","H","G","F","E","D","C","B","A"],
    ["B","D","F","H","J","L","N","P","R","T","V","X","Z","A","C","E","G","I","K","M","O","Q","S","U","W","Y"],
    ["M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H","I","J","K","L"],
    ["H","G","F","E","D","C","B","A","L","K","J","I","P","O","N","M","U","T","S","R","Q","X","W","V","Z","Y"]
])
expected = np.array([
      ["B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A"],
      ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"],
      ["C","E","G","I","K","M","O","Q","S","U","W","Y","A","B","D","F","H","J","L","N","P","R","T","V","X","Z"], 
      ["N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H","I","J","K","L","M"],
      ["I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","A","B","C","D","E","F","G","H"]
  ])

In [None]:
on_hot_inputs = string_to_one_hot_torch(inputs)
on_hot_expected = string_to_one_hot_torch(expected)
print(on_hot_inputs.shape)
print(on_hot_expected.shape)


torch.Size([5, 26, 26, 1])
torch.Size([5, 26, 26, 1])


In [None]:
rnn = VanillaRNN(vocab_size=26, hidden_size=128, alpha=0.01)
rnn.train(inputs=on_hot_inputs, excepted=on_hot_expected, epochs=10)

Epoch 0
Shapes: y_hat: torch.Size([26, 1]), y: torch.Size([26, 26, 1])
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Loss: 4063.133056640625
Shapes: y_hat: torch.Size([26, 1]), y: torch.Size([26, 26, 1])
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Lo

In [None]:
l = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
l[12:-1]

[12, 13, 14]