<a href="https://colab.research.google.com/github/thesidsat/NLPNeuralArchitectureOverview/blob/main/NLP_Architectures_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}\n")

# Sample sentences
sentences = [
    "I love deep learning.",
    "Deep learning will change the world.",
    "PyTorch is a popular deep learning framework.",
    "Manchester United has not been the same since Sir Alex left ",
    "Why was the GPU feeling warm? It had too many layers!",
    "Transformers pay more attention than I do.",
    "You know you're a deep learning model when you have more parameters than friends."
]

print("Sample sentences:")
print(sentences)
print("\nTokenizing the sentences...")

# Tokenizer function
def tokenizer(sentence):
    return sentence.lower().split()

# Create a vocabulary from the tokenized sentences
counter = Counter()
for sentence in sentences:
    counter.update(tokenizer(sentence))

print("\nWord frequency:")
print(counter)

# Assign each word in the vocabulary a unique index
vocab = {word: idx for idx, (word, _) in enumerate(counter.items())}
vocab['<pad>'] = len(vocab)

print("\nVocabulary with index mapping:")
print(vocab)

# Convert sentences to tensor representations
data = [torch.tensor([vocab[word] for word in tokenizer(sentence)]) for sentence in sentences]

print("\nTensor representations of sentences:")
print(data)

# Create a dataset
class TextDataset(Dataset):
    def __init__(self, data):
        self.input = [torch.cat([torch.tensor([vocab['<pad>']]), item[:-1]]) for item in data]
        self.target = data

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return self.input[idx], self.target[idx]

    def __repr__(self):
        return f"TextDataset:\n\nInput: {self.input}\n\nTarget: {self.target}"

print("\nPreparing the dataset and dataloader...")

def collate_fn(batch):
    inputs, targets = zip(*batch)
    return pad_sequence(inputs, padding_value=vocab['<pad>']), pad_sequence(targets, padding_value=vocab['<pad>'])

dataset = TextDataset(data)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

print(dataset)


Using device: cpu

Sample sentences:
['I love deep learning.', 'Deep learning will change the world.', 'PyTorch is a popular deep learning framework.', 'Manchester United has not been the same since Sir Alex left ', 'Why was the GPU feeling warm? It had too many layers!', 'Transformers pay more attention than I do.', "You know you're a deep learning model when you have more parameters than friends."]

Tokenizing the sentences...

Word frequency:
Counter({'deep': 4, 'learning': 3, 'the': 3, 'i': 2, 'a': 2, 'more': 2, 'than': 2, 'you': 2, 'love': 1, 'learning.': 1, 'will': 1, 'change': 1, 'world.': 1, 'pytorch': 1, 'is': 1, 'popular': 1, 'framework.': 1, 'manchester': 1, 'united': 1, 'has': 1, 'not': 1, 'been': 1, 'same': 1, 'since': 1, 'sir': 1, 'alex': 1, 'left': 1, 'why': 1, 'was': 1, 'gpu': 1, 'feeling': 1, 'warm?': 1, 'it': 1, 'had': 1, 'too': 1, 'many': 1, 'layers!': 1, 'transformers': 1, 'pay': 1, 'attention': 1, 'do.': 1, 'know': 1, "you're": 1, 'model': 1, 'when': 1, 'have': 1, 

In [33]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        return self.fc(out)

In [34]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embed = self.embedding(x)
        output, _ = self.lstm(embed)
        return self.fc(output)

In [35]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        return self.fc(out)

In [36]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Using the nn.Transformer module which handles the full transformer architecture
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x, attention_mask=None):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # Transformers expect (S, N, E) format, where S is the source sequence length, N is the batch size, and E is the embedding dimension
        output = self.transformer(x, x, src_key_padding_mask=attention_mask, tgt_key_padding_mask=attention_mask)  # Encoder input and Decoder input are the same for our task
        output = self.fc(output)
        return output.permute(1, 2, 0)  # Convert back to (N, C, S) format

In [37]:
def train_model(model, loader, optimizer, criterion, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for text, label in loader:
            optimizer.zero_grad()
            output = model(text)

            # Using reshape instead of view
            loss = criterion(output.reshape(-1, len(vocab)), label.reshape(-1))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch: {epoch + 1}, Loss: {total_loss / len(loader)}")

In [38]:
# Initialize and train the RNN model as an example
rnn = RNNModel(len(vocab), 64, 128)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)
train_model(rnn, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.8196704983711243
Epoch: 2, Loss: 3.6049699187278748
Epoch: 3, Loss: 3.316157341003418
Epoch: 4, Loss: 3.1795292496681213
Epoch: 5, Loss: 3.1583635807037354
Epoch: 6, Loss: 2.879136562347412
Epoch: 7, Loss: 2.614549160003662
Epoch: 8, Loss: 2.4817424416542053
Epoch: 9, Loss: 2.4807032346725464
Epoch: 10, Loss: 2.5066662430763245


In [39]:
# Instantiate the model
transformer = TransformerModel(len(vocab), 64, 2, 2, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
train_model(transformer, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 4.130754113197327
Epoch: 2, Loss: 4.117112815380096
Epoch: 3, Loss: 3.763562858104706
Epoch: 4, Loss: 3.804576098918915
Epoch: 5, Loss: 3.5726057291030884
Epoch: 6, Loss: 3.372740387916565
Epoch: 7, Loss: 3.4960185289382935
Epoch: 8, Loss: 3.3459811210632324
Epoch: 9, Loss: 3.650835692882538
Epoch: 10, Loss: 3.5030356645584106


In [40]:
lstm = LSTMModel(len(vocab), 64, 128).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
train_model(lstm, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.8587138056755066
Epoch: 2, Loss: 3.7510794401168823
Epoch: 3, Loss: 3.533338189125061
Epoch: 4, Loss: 3.5360676646232605
Epoch: 5, Loss: 3.333391845226288
Epoch: 6, Loss: 3.1898970007896423
Epoch: 7, Loss: 3.0349520444869995
Epoch: 8, Loss: 3.136695981025696
Epoch: 9, Loss: 2.7177299857139587
Epoch: 10, Loss: 2.810640811920166


In [41]:
gru = GRUModel(len(vocab), 64, 128).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru.parameters(), lr=0.001)
train_model(gru, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.894600749015808
Epoch: 2, Loss: 3.750489056110382
Epoch: 3, Loss: 3.670069992542267
Epoch: 4, Loss: 3.5554893612861633
Epoch: 5, Loss: 3.4568702578544617
Epoch: 6, Loss: 3.3212966918945312
Epoch: 7, Loss: 3.3148428201675415
Epoch: 8, Loss: 3.251722037792206
Epoch: 9, Loss: 3.053061604499817
Epoch: 10, Loss: 2.8187662959098816


In [43]:
def infer_next_word(model, sentence):
    model.eval()
    tokens = tokenizer(sentence.lower())  # Convert the sentence to lowercase
    indices = [vocab[token] for token in tokens]
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(input_tensor)
    predicted_index = output[0, -1].argmax(dim=0).item()
    return list(vocab.keys())[list(vocab.values()).index(predicted_index)]


sentence = "Manchester United"
print(f"RNN prediction: {sentence} {infer_next_word(rnn, sentence)}")
print(f"LSTM prediction: {sentence} {infer_next_word(lstm, sentence)}")
print(f"GRU prediction: {sentence} {infer_next_word(gru, sentence)}")
print(f"Transformer prediction: {sentence} {infer_next_word(transformer, sentence)}")

# We cant handle unseen words :)


RNN prediction: Manchester United has
LSTM prediction: Manchester United has
GRU prediction: Manchester United has
Transformer prediction: Manchester United i
