In [None]:
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fix the RNG seed so that weight initialisation and DataLoader shuffling give
# the same results on every Restart-and-Run-All.
SEED = 42
torch.manual_seed(SEED)

print(f"Using device: {device}\n")

# Sample sentences: the whole training corpus for the toy language models below
sentences = [
    "I love deep learning.",
    "Deep learning will change the world.",
    "PyTorch is a popular deep learning framework.",
    "Manchester United has not been the same since Sir Alex left ",
    "Why was the GPU feeling warm? It had too many layers!",
    "Transformers pay more attention than I do.",
    "You know you're a deep learning model when you have more parameters than friends."
]

print("Sample sentences:")
print(sentences)
print("\nTokenizing the sentences...")
# Tokenizer function
def tokenizer(sentence):
    """Lowercase ``sentence`` and split it on runs of whitespace.

    Punctuation stays attached to the neighbouring word, so "learning." and
    "learning" come out as two different tokens.
    """
    lowered = sentence.lower()
    return lowered.split()

# Tally how often each token occurs across the whole corpus
counter = Counter()
for sentence in sentences:
    counter.update(tokenizer(sentence))

print("\nWord frequency:", counter, sep="\n")

# Number the distinct tokens in first-seen order; the padding token is appended
# afterwards and therefore receives the highest index
vocab = {word: idx for idx, word in enumerate(counter)}
vocab['<pad>'] = len(vocab)

print("\nVocabulary with index mapping:", vocab, sep="\n")

# Encode every sentence as a 1-D tensor of vocabulary indices
data = [
    torch.tensor([vocab[token] for token in tokenizer(text)])
    for text in sentences
]

print("\nTensor representations of sentences:", data, sep="\n")

# Create a dataset
class TextDataset(Dataset):
    """Next-token prediction pairs built from encoded sentences.

    For every sentence the target is the sentence itself and the input is the
    same sequence shifted right by one position: the ``<pad>`` index is put in
    front and the last token is dropped.
    """

    def __init__(self, data):
        # `data` is indexed, sliced and concatenated as 1-D tensors below.
        start = torch.tensor([vocab['<pad>']])
        shifted = []
        for sequence in data:
            # torch.cat copies, so sharing `start` between items is safe
            shifted.append(torch.cat([start, sequence[:-1]]))
        self.input = shifted
        self.target = data

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        pair = (self.input[idx], self.target[idx])
        return pair

    def __repr__(self):
        return f"TextDataset:\n\nInput: {self.input}\n\nTarget: {self.target}"

print("\nPreparing the dataset and dataloader...")

def collate_fn(batch):
    """Pad a list of (input, target) pairs into two batch-first index tensors.

    Args:
        batch: sequence of ``(input, target)`` pairs of 1-D index tensors, as
            returned by ``TextDataset.__getitem__``.

    Returns:
        ``(inputs, targets)``, each of shape (batch, longest_sequence), padded
        at the end with the ``<pad>`` index.
    """
    inputs, targets = zip(*batch)
    pad_idx = vocab['<pad>']
    # batch_first=True is required: pad_sequence defaults to (seq, batch), but
    # the models (batch_first=True RNN/GRU, the Transformer's permute) and
    # infer_next_word all treat dimension 0 as the batch dimension.
    return (
        pad_sequence(inputs, batch_first=True, padding_value=pad_idx),
        pad_sequence(targets, batch_first=True, padding_value=pad_idx),
    )

dataset = TextDataset(data)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

print(dataset)


Using device: cpu

Sample sentences:
['I love deep learning.', 'Deep learning will change the world.', 'PyTorch is a popular deep learning framework.', 'Manchester United has not been the same since Sir Alex left ', 'Why was the GPU feeling warm? It had too many layers!', 'Transformers pay more attention than I do.', "You know you're a deep learning model when you have more parameters than friends."]

Tokenizing the sentences...

Word frequency:
Counter({'deep': 4, 'learning': 3, 'the': 3, 'i': 2, 'a': 2, 'more': 2, 'than': 2, 'you': 2, 'love': 1, 'learning.': 1, 'will': 1, 'change': 1, 'world.': 1, 'pytorch': 1, 'is': 1, 'popular': 1, 'framework.': 1, 'manchester': 1, 'united': 1, 'has': 1, 'not': 1, 'been': 1, 'same': 1, 'since': 1, 'sir': 1, 'alex': 1, 'left': 1, 'why': 1, 'was': 1, 'gpu': 1, 'feeling': 1, 'warm?': 1, 'it': 1, 'had': 1, 'too': 1, 'many': 1, 'layers!': 1, 'transformers': 1, 'pay': 1, 'attention': 1, 'do.': 1, 'know': 1, "you're": 1, 'model': 1, 'when': 1, 'have': 1, 

In [None]:
class RNNModel(nn.Module):
    """Embedding -> ``nn.RNN`` -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        # Token index -> dense vector of size embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # Hidden state -> one score per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return per-position logits, shape (batch, seq, vocab_size), for indices ``x`` of shape (batch, seq)."""
        embedded = self.embedding(x)
        # The second return value (final hidden state) is not needed here
        hidden_states, _final_state = self.rnn(embedded)
        logits = self.fc(hidden_states)
        return logits

RNNModel Class Explained

Class Declaration

    RNNModel inherits from nn.Module, which is the base class for all neural network modules in PyTorch.

Initialization Method (__init__)

    Takes three parameters:
        vocab_size: The number of unique words/tokens in your vocabulary.
        embed_size: The dimensionality of the embedding vector for each word/token.
        hidden_size: The size of the hidden state of the RNN.

Embedding Layer

    An embedding layer is created to convert input token IDs (usually integers) into dense vectors.
    This allows the model to learn meaningful representations for each word/token in the vocabulary.

RNN Layer

    A basic RNN layer is initialized.
    It takes the embedded input and processes it sequentially, producing a sequence of hidden states.
    The batch_first=True argument means that input and output tensors are expected to be in the format: (batch_size, sequence_length, feature_size).

Fully Connected Layer (Linear Layer)

    This layer is used to map the RNN's hidden state to the vocabulary size.
    It essentially helps in predicting the next word/token based on the RNN's hidden state.

Forward Method

    Defines how the input will be processed as it goes through the network.

Embedding Transformation

    The input sequences are transformed into dense vectors using the embedding layer.

RNN Transformation

    The embedded sequences are then processed by the RNN layer.
    The RNN produces a sequence of hidden states for each time step in the input.
    The final hidden state, which is not used in this model, is discarded.

Fully Connected Layer Transformation

    The RNN's output (hidden states for each time step) is passed through the fully connected layer.
    This layer produces predictions for the next word/token for each time step in the sequence.

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> ``nn.LSTM`` -> linear projection to vocabulary logits.

    The LSTM is built with ``batch_first=True`` so that it shares the
    (batch, seq) input layout of ``RNNModel`` and ``GRUModel``. That is also
    the layout ``infer_next_word`` relies on: it feeds a (1, seq) tensor and
    reads ``output[0, -1]``. Without the flag the LSTM would treat that input
    as a one-step sequence and the prediction would ignore all context.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTMModel, self).__init__()
        # Token index -> dense vector of size embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        # Hidden state -> one score per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return per-position logits, shape (batch, seq, vocab_size), for indices ``x`` of shape (batch, seq)."""
        embed = self.embedding(x)
        # The second return value (final hidden and cell state) is not needed here
        output, _ = self.lstm(embed)
        return self.fc(output)

LSTMModel Class Explained

Initialization Method (__init__)

    Takes three parameters:
        vocab_size: The number of unique words/tokens in your vocabulary.
        embed_size: The dimensionality of the embedding vector for each word/token.
        hidden_size: The size of the hidden state of the LSTM.

Embedding Layer

    An embedding layer is created to convert input token IDs (usually integers) into dense vectors.
    This allows the model to learn meaningful representations for each word/token in the vocabulary.

LSTM Layer

    An LSTM (Long Short-Term Memory) layer is initialized.
    LSTM is a type of recurrent neural network (RNN) that is capable of learning and remembering over long sequences and is less susceptible to the vanishing gradient problem compared to basic RNNs.
    It takes the embedded input and processes it sequentially, producing a sequence of hidden states.

Fully Connected Layer (Linear Layer)

    This layer is used to map the LSTM's hidden state to the vocabulary size.
    It helps in predicting the next word/token based on the LSTM's hidden state.

Forward Method

    Defines how the input will be processed as it goes through the network.

Embedding Transformation

    The input sequences are transformed into dense vectors using the embedding layer.

LSTM Transformation

    The embedded sequences are then processed by the LSTM layer.
    The LSTM produces a sequence of hidden states for each time step in the input.
    The second output from the LSTM (represented by _ in the code) includes the final hidden state and cell state, but this isn't used in the model's current configuration.

Fully Connected Layer Transformation

    The LSTM's output (hidden states for each time step) is passed through the fully connected layer.
    This layer produces predictions for the next word/token for each time step in the sequence.



In [None]:
class GRUModel(nn.Module):
    """Embedding -> ``nn.GRU`` -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        # Token index -> dense vector of size embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        # Hidden state -> one score per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return per-position logits, shape (batch, seq, vocab_size), for indices ``x`` of shape (batch, seq)."""
        embedded = self.embedding(x)
        # The second return value (final hidden state) is not needed here
        hidden_states, _final_state = self.gru(embedded)
        logits = self.fc(hidden_states)
        return logits

GRUModel Class Explained

Initialization Method (__init__)

    Parameters:
        vocab_size: The number of unique words/tokens in your vocabulary.
        embed_size: The dimensionality of the embedding vector for each word/token.
        hidden_size: The size of the hidden state of the GRU.

Embedding Layer

    An embedding layer is initialized, designed to convert input token IDs (typically integers) into dense vectors.
    The layer allows the model to assign and adjust vector representations for each word/token in the vocabulary, thereby learning context and meaning.

GRU Layer

    A GRU (Gated Recurrent Unit) layer is initialized.
    The GRU is a variant of a recurrent neural network (RNN). It's designed to remember past information and is typically faster and simpler than its counterpart, the LSTM, as it uses fewer gates.
    The batch_first=True parameter means that the input tensor to the GRU should have its first dimension represent the batch size.

Fully Connected Layer (Linear Layer)

    A linear layer (or fully connected layer) is used to map the GRU's hidden state to a size equivalent to the vocabulary.
    This transformation facilitates the prediction of the next word/token based on the current hidden state from the GRU.

Forward Method

    Describes the pathway of the input as it traverses through the network.

Embedding Transformation

    The input sequences, comprised of token IDs, are mapped to dense vectors via the embedding layer.

GRU Transformation

    The transformed embedded sequences are processed by the GRU layer.
    The GRU yields a sequence of hidden states corresponding to each time step in the input.
    The second output (represented by _ in the code) is the final hidden state from the GRU, which isn't employed in the current setup of the model.

Fully Connected Layer Transformation

    The output from the GRU, which is a series of hidden states for each time step, is routed through the fully connected layer.
    This layer's output provides predictions for the probable next word/token for each sequence time step.

In [None]:
class TransformerModel(nn.Module):
    """Embedding -> ``nn.Transformer`` (encoder + decoder) -> vocabulary logits.

    The same embedded sequence is fed to both the encoder and the decoder.
    NOTE: no positional encoding is added to the embeddings.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers):
        super(TransformerModel, self).__init__()

        # Token index -> dense vector of size embed_size (also the transformer's d_model)
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Using the nn.Transformer module which handles the full transformer architecture
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers
        )
        # Transformer output -> one score per vocabulary entry
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x, attention_mask=None):
        """Return next-token logits for every position.

        Args:
            x: index tensor of shape (N, S) (batch, sequence).
            attention_mask: optional key-padding mask of shape (N, S), passed
                to the transformer as both ``src_key_padding_mask`` and
                ``tgt_key_padding_mask``.

        Returns:
            Logits of shape (N, S, vocab_size), the same layout the recurrent
            models return, so ``train_model`` can flatten the last dimension
            and ``infer_next_word`` can read ``output[0, -1]``.
        """
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # nn.Transformer (batch_first=False) expects (S, N, E)

        # Causal mask: True above the diagonal blocks attention to later
        # positions. It is applied to encoder self-attention, decoder
        # self-attention and decoder-to-encoder attention, because encoder and
        # decoder both receive the full input sequence. Without it, position t
        # could read the later tokens it is being trained to predict.
        seq_len = x.size(0)
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )

        output = self.transformer(
            x, x,  # Encoder input and Decoder input are the same for our task
            src_mask=causal_mask,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
            src_key_padding_mask=attention_mask,
            tgt_key_padding_mask=attention_mask,
        )
        output = self.fc(output)  # (S, N, vocab_size)
        return output.permute(1, 0, 2)  # back to batch-first: (N, S, vocab_size)

TransformerModel Class Explained

Initialization Method (__init__)

    Parameters:
        vocab_size: Number of unique words/tokens in the vocabulary.
        embed_size: Dimensionality of the embedding vector for each word/token.
        num_heads: Number of attention heads in the multi-head attention mechanism.
        num_encoder_layers: Number of layers in the transformer encoder.
        num_decoder_layers: Number of layers in the transformer decoder.

Embedding Layer

    An embedding layer converts the input token IDs into dense vectors.
        This layer learns a dense representation for each word/token in the vocabulary during training.

Transformer Layer

    The main Transformer module from PyTorch is utilized, which provides the full architecture of the transformer, including both encoder and decoder.
        d_model: Refers to the depth (size) of the representation, which matches the embedding size.
        nhead: Specifies the number of heads in the multi-head attention mechanism.
        num_encoder_layers: Specifies how many encoder layers the transformer should have.
        num_decoder_layers: Specifies the number of decoder layers.

Fully Connected Layer (Linear Layer)

    A linear layer is designed to map the transformer's output to the vocabulary size, effectively allowing us to make word/token predictions based on the transformer's output.

Forward Method

    Parameters:
        x: The input sequences.
        attention_mask: Optional key-padding mask of shape (N, S); True marks positions (such as padding tokens) that attention should ignore.

Embedding Transformation

    The input sequence x is mapped to dense vectors using the embedding layer.

Permutation of Dimensions

    The dimensions of the input are permuted from (N, S, E) to (S, N, E) using the .permute method.
        Reason: The transformer model in PyTorch expects the input in the format (S, N, E), where S is the source sequence length, N is the batch size, and E is the embedding dimension.

Transformer Transformation

    The transformer takes in the embedded sequences for both the encoder and decoder (they are the same in this case) and produces the output sequences.
        src_key_padding_mask and tgt_key_padding_mask are optional masks to prevent the attention mechanism from focusing on specific positions (like padding tokens).

Fully Connected Layer Transformation

    The output from the transformer is passed through the fully connected layer to produce predictions for the next word/token for each sequence position.

Output Permutation

    The output dimensions are permuted back to (N, C, S) format, where N is the batch size, C is the number of channels (equal to the vocabulary size in this case), and S is the source sequence length.

In [None]:
def train_model(model, loader, optimizer, criterion, epochs):
    """Train ``model`` for ``epochs`` passes over ``loader``, printing the mean loss per epoch.

    Args:
        model: module mapping an index tensor to logits whose flattened last
            dimension has ``len(vocab)`` entries.
        loader: iterable of ``(text, label)`` index-tensor batches; must support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(N, len(vocab))`` logits and ``(N,)`` targets.
        epochs: number of passes over ``loader``.
    """
    # Batches come off the loader on the CPU; move them to wherever the model
    # lives so training also works for models created with .to(device) on a GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for text, label in loader:
            if model_device is not None:
                text = text.to(model_device)
                label = label.to(model_device)

            optimizer.zero_grad()
            output = model(text)

            # reshape (not view) because the model output may be non-contiguous
            loss = criterion(output.reshape(-1, len(vocab)), label.reshape(-1))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # An empty loader has no mean loss; report NaN instead of dividing by zero
        num_batches = len(loader)
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch: {epoch + 1}, Loss: {mean_loss}")

In [None]:
# Initialize and train the RNN model as an example
# Move the model to `device`, as the other model cells do and as infer_next_word expects
rnn = RNNModel(len(vocab), 64, 128).to(device)
# Padded target positions carry no information, so exclude them from the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)
train_model(rnn, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.8196704983711243
Epoch: 2, Loss: 3.6049699187278748
Epoch: 3, Loss: 3.316157341003418
Epoch: 4, Loss: 3.1795292496681213
Epoch: 5, Loss: 3.1583635807037354
Epoch: 6, Loss: 2.879136562347412
Epoch: 7, Loss: 2.614549160003662
Epoch: 8, Loss: 2.4817424416542053
Epoch: 9, Loss: 2.4807032346725464
Epoch: 10, Loss: 2.5066662430763245


In [None]:
# Instantiate and train the Transformer model
# Arguments: vocab size, embedding size 64, 2 attention heads, 2 encoder layers, 2 decoder layers
transformer = TransformerModel(len(vocab), 64, 2, 2, 2).to(device)
# Padded target positions carry no information, so exclude them from the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
train_model(transformer, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 4.130754113197327
Epoch: 2, Loss: 4.117112815380096
Epoch: 3, Loss: 3.763562858104706
Epoch: 4, Loss: 3.804576098918915
Epoch: 5, Loss: 3.5726057291030884
Epoch: 6, Loss: 3.372740387916565
Epoch: 7, Loss: 3.4960185289382935
Epoch: 8, Loss: 3.3459811210632324
Epoch: 9, Loss: 3.650835692882538
Epoch: 10, Loss: 3.5030356645584106


In [None]:
# Instantiate and train the LSTM model (embedding size 64, hidden size 128)
lstm = LSTMModel(len(vocab), 64, 128).to(device)
# Padded target positions carry no information, so exclude them from the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
train_model(lstm, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.8587138056755066
Epoch: 2, Loss: 3.7510794401168823
Epoch: 3, Loss: 3.533338189125061
Epoch: 4, Loss: 3.5360676646232605
Epoch: 5, Loss: 3.333391845226288
Epoch: 6, Loss: 3.1898970007896423
Epoch: 7, Loss: 3.0349520444869995
Epoch: 8, Loss: 3.136695981025696
Epoch: 9, Loss: 2.7177299857139587
Epoch: 10, Loss: 2.810640811920166


In [None]:
# Instantiate and train the GRU model (embedding size 64, hidden size 128)
gru = GRUModel(len(vocab), 64, 128).to(device)
# Padded target positions carry no information, so exclude them from the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = torch.optim.Adam(gru.parameters(), lr=0.001)
train_model(gru, loader, optimizer, criterion, epochs=10)

Epoch: 1, Loss: 3.894600749015808
Epoch: 2, Loss: 3.750489056110382
Epoch: 3, Loss: 3.670069992542267
Epoch: 4, Loss: 3.5554893612861633
Epoch: 5, Loss: 3.4568702578544617
Epoch: 6, Loss: 3.3212966918945312
Epoch: 7, Loss: 3.3148428201675415
Epoch: 8, Loss: 3.251722037792206
Epoch: 9, Loss: 3.053061604499817
Epoch: 10, Loss: 2.8187662959098816


In [None]:
def infer_next_word(model, sentence):
    """Return the vocabulary word the model scores highest as the continuation of ``sentence``.

    Args:
        model: trained module mapping a (1, seq) index tensor to logits
            indexed as ``output[0, -1]`` -> (vocab_size,).
        sentence: prompt text; it is tokenized with ``tokenizer``.

    Returns:
        The word whose index has the highest logit at the last position.

    Raises:
        ValueError: if ``sentence`` contains no tokens.
        KeyError: if a token is not in ``vocab`` (there is no unknown-word token).
    """
    model.eval()
    tokens = tokenizer(sentence.lower())  # Convert the sentence to lowercase
    if not tokens:
        raise ValueError("sentence must contain at least one token")

    unknown = [token for token in tokens if token not in vocab]
    if unknown:
        raise KeyError(f"tokens not in vocabulary: {unknown}")
    indices = [vocab[token] for token in tokens]

    # Put the input on the device the model actually lives on; fall back to
    # the global `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    input_tensor = torch.tensor(indices, dtype=torch.long, device=target_device).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
    predicted_index = output[0, -1].argmax(dim=0).item()

    # Invert the word -> index mapping once instead of searching two lists
    index_to_word = {index: word for word, index in vocab.items()}
    return index_to_word[predicted_index]


sentence = "Manchester United"
# Ask each trained model for the next word, in the same order as before
for model_name, trained_model in (
    ("RNN", rnn),
    ("LSTM", lstm),
    ("GRU", gru),
    ("Transformer", transformer),
):
    print(f"{model_name} prediction: {sentence} {infer_next_word(trained_model, sentence)}")

# We can't handle unseen words :) (every prompt token must already be in `vocab`)


RNN prediction: Manchester United has
LSTM prediction: Manchester United has
GRU prediction: Manchester United has
Transformer prediction: Manchester United i
