## LSTM
[From this page](https://medium.com/@CallMeTwitch/building-a-neural-network-zoo-from-scratch-the-long-short-term-memory-network-1cec5cf31b7)

In [56]:
##### Imports #####
from tqdm import tqdm
import torch
import numpy as np
from d2l import torch as d2l

In [59]:
print(d2l.try_gpu())
print(d2l.gpu())
print(d2l.num_gpus())
device = d2l.try_gpu()

cuda:0
cuda:0
1


In [60]:
##### Data #####
data = """To be, or not to be, that is the question: Whether \
'tis nobler in the mind to suffer The slings and arrows of ou\
trageous fortune, Or to take arms against a sea of troubles A\
nd by opposing end them. To die—to sleep, No more; and by a s\
leep to say we end The heart-ache and the thousand natural sh\
ocks That flesh is heir to: 'tis a consummation Devoutly to b\
e wish'd. To die, to sleep; To sleep, perchance to dream—ay, \
there's the rub: For in that sleep of death what dreams may c\
ome, When we have shuffled off this mortal coil, Must give us\
 pause—there's the respect That makes calamity of so long lif\
e. For who would bear the whips and scorns of time, Th'oppres\
sor's wrong, the proud man's contumely, The pangs of dispriz'\
d love, the law's delay, The insolence of office, and the spu\
rns That patient merit of th'unworthy takes, When he himself \
might his quietus make""".lower()

chars = set(data)

data_size, char_size = len(data), len(chars)

print(f'Data size: {data_size}, Char Size: {char_size}')

char_to_idx = {c:i for i, c in enumerate(chars)}
idx_to_char = {i:c for i, c in enumerate(chars)}

train_X, train_y = data[:-1], data[1:]

Data size: 866, Char Size: 32


In [61]:
print(char_to_idx)
print(idx_to_char)

{'-': 0, ' ': 1, 'c': 2, 'a': 3, ';': 4, ',': 5, '.': 6, 't': 7, 'e': 8, 'u': 9, '—': 10, 'd': 11, 'r': 12, 'f': 13, 'p': 14, 'n': 15, 'k': 16, 'b': 17, "'": 18, 'm': 19, 'h': 20, ':': 21, 'v': 22, 'q': 23, 'z': 24, 'o': 25, 'w': 26, 's': 27, 'i': 28, 'y': 29, 'l': 30, 'g': 31}
{0: '-', 1: ' ', 2: 'c', 3: 'a', 4: ';', 5: ',', 6: '.', 7: 't', 8: 'e', 9: 'u', 10: '—', 11: 'd', 12: 'r', 13: 'f', 14: 'p', 15: 'n', 16: 'k', 17: 'b', 18: "'", 19: 'm', 20: 'h', 21: ':', 22: 'v', 23: 'q', 24: 'z', 25: 'o', 26: 'w', 27: 's', 28: 'i', 29: 'y', 30: 'l', 31: 'g'}


In [73]:
##### Helper Functions #####
def oneHotEncode(text):
    output = torch.zeros((char_size, 1),device=device)
    output[char_to_idx[text]] = 1

    return output

# Xavier Normalized Initialization
def initWeights(input_size:int, output_size:int) -> torch.Tensor:
    return torch.rand((output_size, input_size),device=device).uniform_(-1, 1) # * np.sqrt(6 / (input_size + output_size))

In [75]:
train_X_onehot = [oneHotEncode(c) for c in train_X]

In [83]:
type(train_X_onehot), train_X_onehot[0].shape

(list, torch.Size([32, 1]))

In [63]:
def sigmoid(input: torch.Tensor, derivative=False):
    if derivative:
        return input * (1 - input)
    return 1 / (1 + torch.exp(-input))

def tanh(input:torch.Tensor, derivative=False):
    if derivative:
        return 1 - input ** 2
    return torch.tanh(input)

def softmax(input:torch.Tensor):
    return torch.exp(input) / torch.sum(torch.exp(input), dim=0)

In [71]:
class LSTM:
    
    def __init__(self, input_size, hidden_size, output_size,num_epochs, learning_rate=0.01):
        # Hyperparameters
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.num_epochs = num_epochs

        # Forget Gate
        self.W_f = initWeights(input_size, hidden_size)
        self.b_f = torch.zeros((hidden_size, 1),device=device)
        
        # Input Gate
        self.W_i = initWeights(input_size, hidden_size)
        self.b_i = torch.zeros((hidden_size, 1),device=device)
        
        # Candidate Gate
        self.W_c = initWeights(input_size, hidden_size)
        self.b_c = torch.zeros((hidden_size, 1),device=device)
        
        # Output Gate
        self.W_o = initWeights(input_size, hidden_size)
        self.b_o = torch.zeros((hidden_size, 1),device=device)
        
        # Hidden to Output
        self.W_y = initWeights(hidden_size, output_size)
        self.b_y = torch.zeros((output_size, 1),device=device)
    def reset(self):
        self.concat_inputs = torch.Tensor()
        
        self.hidden_states = {-1: torch.zeros((self.hidden_size, 1),device=device)}
        self.cell_states = {-1: torch.zeros((self.hidden_size, 1),device=device)}
        
        self.activation_outputs = torch.Tensor(device=device)
        self.cadidate_gates = torch.Tensor(device=device)
        self.input_gates = torch.Tensor(device=device)
        self.forget_gates = torch.Tensor(device=device)
        self.output_gates = torch.Tensor(device=device)
        self.outputs = torch.Tensor(device=device)
        
    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        self.reset()
        outputs = torch.Tensor()
        for t in range(len(inputs)):
            self.concat_inputs[t] = torch.concatenate((self.hidden_states[t-1], inputs[t]))      
            
            self.forget_gates[t] = sigmoid(self.W_f @ self.concat_inputs[t] + self.b_f)
            self.input_gates[t] = sigmoid(self.W_i @ self.concat_inputs[t] + self.b_i)
            self.cadidate_gates[t] = tanh(self.W_c @ self.concat_inputs[t] + self.b_c)
            self.output_gates[t] = sigmoid(self.W_o @ self.concat_inputs[t] + self.b_o)
           
            self.cell_states[t] = self.forget_gates[t] * self.cell_states[t-1] + self.input_gates[t] * self.cadidate_gates[t]
            self.hidden_states[t] = self.output_gates[t] * tanh(self.cell_states[t])
            
            output = self.W_y @ self.hidden_states[t] + self.b_y
            torch.cat((outputs, output),dim=0)
            
        return outputs 
        
    # Backpropagation Through Time
    def backward(self, inputs:torch.Tensor, errors:torch.Tensor) -> None:
        d_wf,d_bf = torch.zeros(1,device=device),torch.zeros(1,device=device)
        d_wi,d_bi = torch.zeros(1,device=device),torch.zeros(1,device=device)
        d_wo,d_bo = torch.zeros(1,device=device),torch.zeros(1,device=device)
        d_wc,d_bc = torch.zeros(1,device=device),torch.zeros(1,device=device)
        d_wy,d_by = torch.zeros(1,device=device),torch.zeros(1,device=device)
        
        dh_next, dc_next = torch.zeros_like(self.hidden_states[0],device=device), torch.zeros_like(self.cell_states[0],device=device)
        for t in reversed(range(len(inputs))):
            error = errors[t]
            # Output layer gradients
            
            # Final Gate Weights and Biases Errors
            d_wy += error @ self.hidden_states[t].T
            d_by += error
            
            # Hidden State Error
            d_hs = self.W_y.T @ error + dh_next
            
            # Output Gate Weights and Biases Errors
            # Why tanh don't have derivative=True?
            d_o = tanh(self.cell_states[t]) * d_hs * sigmoid(self.output_gates[t], derivative=True)
            d_wo += d_o @ inputs[t].T
            d_bo += d_o
            
            # Cell State Error
            d_cs = tanh(tanh(self.cell_states[t]), derivative=True) * self.output_gates[t] * d_hs + dc_next
            
            # forget Gate Weights and Biases Errors
            d_f = d_cs * self.cell_states[t-1] * sigmoid(self.forget_gates[t], derivative=True)
            d_wf = d_f @ inputs[t].T
            d_bf += d_f
            
            # Input Gate Weights and Biases Errors
            d_i = d_cs * self.cadidate_gates[t] * sigmoid(self.input_gates[t], derivative=True)
            d_wi += d_i @ inputs[t].T
            d_bi += d_i
            
            # Candidate Gate Weights and Biases Errors
            d_c = d_cs * self.input_gates[t] * tanh(self.cadidate_gates[t], derivative=True)
            d_wc += d_c @ inputs[t].T
            d_bc += d_c
            
            # Concanted Input Error (Sum of Error for each gate)
            d_z = self.W_f.T @ d_f + self.W_i.T @ d_i + self.W_c.T @ d_c + self.W_o.T @ d_o
            
            # Error of Hidden State and Cell State at Next Time Step
            dh_next = d_z[:self.hidden_size, :]
            dc_next = self.forget_gates[t] * d_cs
            
        for d_ in [d_wf,d_bf,d_wi,d_bi,d_wo,d_bo,d_wc,d_bc,d_wy,d_by]:
            torch.clamp(d_, min = -1,max= 1, out=d_)
            
        # Update Weights and Biases
        
        self.W_f += self.learning_rate * d_wf
        self.b_f += self.learning_rate * d_bf
        
        self.W_i += self.learning_rate * d_wi
        self.b_i += self.learning_rate * d_bi
        
        self.W_o += self.learning_rate * d_wo
        self.b_o += self.learning_rate * d_bo
        
        self.W_c += self.learning_rate * d_wc
        self.b_c += self.learning_rate * d_bc
        
        self.W_y += self.learning_rate * d_wy
        self.b_y += self.learning_rate * d_by    
        
    # Training the Model
    def train(self, inputs: torch.Tensor, targets: torch.Tensor) -> None:
        
        for _ in tqdm(range(self.num_epochs)):
            predictions = self.forward(inputs)
            
            errors = torch.Tensor(device=device)
            
            for t in range(len(predictions)):
                errors = torch.cat((errors, -softmax(predictions[t])), dim=0)
                errors[-1][targets[t]] += 1
            self.backward(errors, self.concat_inputs)

    # Test

    def test(self, inputs: torch.Tensor, targets: torch.Tensor) -> None:
        accuracy = 0
        probabilities = self.forward(inputs)
        
        output = []
        
        for q in range(len(targets)):
            prediction = softmax(probabilities[q])
            output += prediction
            
            if torch.argmax(prediction) == targets[q]:
                accuracy += 1
            
            print(f'Ground Truth: {targets[q]}, Prediction: {torch.argmax(prediction).item()}')
            print(f'Accuracy: {accuracy / len(targets) * 100}%')

In [74]:
# Initialize Network
hidden_size = 25

lstm = LSTM(input_size = char_size + hidden_size, hidden_size = hidden_size, output_size = char_size, num_epochs = 1_000, learning_rate = 0.05)


In [None]:
train_X_onehot = [oneHotEncode(c) for c in train_X]
train_y_onehot = [oneHotEncode(c) for c in train_y]

In [None]:
##### Training #####
lstm.train(train_X_onehot, train_y_onehot)

In [44]:
A_np = np.array([[1, 2], [3, 4]])
B_np = np.array([[5, 6], [7, 8]])
result_np = np.dot(A_np, B_np)
print(f"NumPy 2D dot product (matrix multiplication): \n{result_np}")

A_torch = torch.tensor([[1, 2], [3, 4]])
B_torch = torch.tensor([[5, 6], [7, 8]])
result_torch_matmul = torch.matmul(A_torch, B_torch)
result_torch_at = A_torch @ B_torch
print(f"PyTorch 2D @ operator: \n{result_torch_at}")
print(f"PyTorch 2D matmul: \n{result_torch_matmul}")

NumPy 2D dot product (matrix multiplication): 
[[19 22]
 [43 50]]
PyTorch 2D @ operator: 
tensor([[19, 22],
        [43, 50]])
PyTorch 2D matmul: 
tensor([[19, 22],
        [43, 50]])


In [50]:
A_torch = torch.cat((A_torch, B_torch), dim=0)


In [54]:

A_torch[-1]

tensor([7, 8])