# LSTM RNN


In [2]:
# libraries
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import defaultdict


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device.type

'cpu'

In [4]:
# Toy parallel corpus: every entry is a two-item list [english, german].
pairs = [
    [english, german]
    for english, german in (
        ("i am a student", "ich bin ein student"),
        ("he is a teacher", "er ist ein lehrer"),
        ("she is happy", "sie ist glücklich"),
    )
]

In [None]:
def tokenize(sentence:str)->(list):
    """Lower-case ``sentence`` and split it into whitespace-separated words.

    Returns a list of word strings; an empty or whitespace-only sentence
    yields an empty list.
    """
    lowered = sentence.lower()
    words = lowered.split()
    return words

def build_vocab(sentences:list)->(dict):
    """Build a word -> index vocabulary from an iterable of sentences.

    The first three indices are reserved for special tokens:
      <pad> (0) - filler used to align sentences of different lengths
      <sos> (1) - "start of sentence", placed before each sentence
      <eos> (2) - "end of sentence", placed after each sentence
    Every other word receives the next free index, in order of first
    appearance after tokenization.
    """
    vocab = dict(zip(("<pad>", "<sos>", "<eos>"), range(3)))
    for sentence in sentences:
        for word in tokenize(sentence):
            # Indices are contiguous from 0, so the current size of the
            # vocabulary is exactly the next unused index. setdefault leaves
            # already-known words untouched.
            vocab.setdefault(word, len(vocab))
    return vocab


In [6]:
# One vocabulary per language: item 0 of each pair is English, item 1 is German.
eng_vocab = build_vocab([sentence_pair[0] for sentence_pair in pairs])
german_vocab = build_vocab([sentence_pair[1] for sentence_pair in pairs])
# Show both vocabularies, one per line.
print(eng_vocab, german_vocab, sep="\n")

{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'i': 3, 'am': 4, 'a': 5, 'student': 6, 'he': 7, 'is': 8, 'teacher': 9, 'she': 10, 'happy': 11}
{'<pad>': 0, '<sos>': 1, '<eos>': 2, 'ich': 3, 'bin': 4, 'ein': 5, 'student': 6, 'er': 7, 'ist': 8, 'lehrer': 9, 'sie': 10, 'glücklich': 11}


In [9]:
def sentence_to_indices(sentence:str, vocab:dict)->(list):
    """Encode ``sentence`` as vocabulary indices wrapped in <sos> ... <eos>.

    The sentence is tokenized, each token is looked up in ``vocab``, and the
    result is framed by the indices of "<sos>" and "<eos>". A token (or
    special token) missing from ``vocab`` raises KeyError.
    """
    words = tokenize(sentence)
    indices = [vocab["<sos>"]]
    indices.extend(vocab[word] for word in words)
    indices.append(vocab["<eos>"])
    return indices

# Encode the English side of every pair as a list of vocabulary indices.
eng_indices = [
    sentence_to_indices(sentence_pair[0], eng_vocab)
    for sentence_pair in pairs
]
eng_indices

[[1, 3, 4, 5, 6, 2], [1, 7, 8, 5, 9, 2], [1, 10, 8, 11, 2]]

In [None]:
def prepare_batch(pairs:list, eng_vocab:dict, german_vocab:dict):
    """Turn (english, german) sentence pairs into two padded index tensors.

    Args:
        pairs: iterable of two-item sequences ``(english, german)``.
            Expected to be non-empty, since pad_sequence needs at least one
            sequence to pad.
        eng_vocab: word -> index mapping used for the English (source) side.
        german_vocab: word -> index mapping used for the German (target) side.

    Returns:
        ``(src_batch, trg_batch)``: two ``torch.long`` tensors. pad_sequence
        is called with its default ``batch_first=False``, so each tensor is
        laid out as (longest sentence length, number of pairs): one column
        per sentence, shorter sentences filled at the bottom with the
        vocabulary's padding index.

    Raises:
        KeyError: if a word of a sentence is missing from its vocabulary.
    """
    src_batch = []  # English sentences as 1-D index tensors
    trg_batch = []  # German sentences as 1-D index tensors

    for eng, ger in pairs:
        src = sentence_to_indices(eng, eng_vocab)
        trg = sentence_to_indices(ger, german_vocab)

        # Index lists become integer tensors so they can feed an embedding.
        src_batch.append(torch.tensor(src, dtype=torch.long))
        trg_batch.append(torch.tensor(trg, dtype=torch.long))

    # Take the padding index from each vocabulary instead of hard-coding 0,
    # so padding stays in sync with the vocab if the special-token indices
    # ever change. Falls back to 0 (the previous behaviour) when a vocabulary
    # has no "<pad>" entry.
    src_pad = eng_vocab.get("<pad>", 0)
    trg_pad = german_vocab.get("<pad>", 0)

    # Example with padding index 0:
    #   [tensor([1, 3, 4, 2]), tensor([1, 3, 4, 5, 6, 2])]
    # pad_sequence(...) ->
    #   tensor([[1, 1],
    #           [3, 3],
    #           [4, 4],
    #           [2, 5],
    #           [0, 6],
    #           [0, 2]])
    # Column 0 is the first sentence, column 1 the second; the trailing zeros
    # are padding so that all columns have the same length.
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=src_pad)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, padding_value=trg_pad)

    return src_batch, trg_batch

In [15]:
# Build the padded source/target batches and move each onto the chosen device.
src_batch, trg_batch = (
    batch.to(device)
    for batch in prepare_batch(pairs, eng_vocab, german_vocab)
)
src_batch

tensor([[ 1,  1,  1],
        [ 3,  7, 10],
        [ 4,  8,  8],
        [ 5,  5, 11],
        [ 6,  9,  2],
        [ 2,  2,  0]])

### Encoder part

In [None]:
class Encoder(nn.Module):
    """Embed a sequence of token indices and summarise it with an LSTM.

    Data flow:
        token indices -> embedding vectors -> LSTM -> (hidden, cell)

    Args:
        input_dim: number of entries in the source vocabulary (one embedding
            row per word index).
        emb_dim: length of each word's embedding vector (e.g. 32 or 100).
        hidden_dim: size of the LSTM hidden/cell state, i.e. its "memory".
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        # Lookup table from word index to a trainable vector of length
        # emb_dim; values start out random and are learned during training.
        # NOTE: the attribute is spelled "embeding"; the name is kept as-is
        # because it is part of the module's state_dict keys.
        self.embeding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        # Single LSTM layer that consumes the embedded sequence.
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` state.

        ``src`` is assumed to be an integer tensor of token indices laid out
        as (seq_len, batch), since the LSTM is created with the default
        ``batch_first=False`` - TODO confirm against callers. The per-step
        LSTM outputs are discarded; only the final states are returned.
        """
        token_vectors = self.embeding(src)
        _, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return hidden, cell