In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Toy corpus: an 8-word vocabulary and three 4-token sentences.
tokens = ["i", "like", "deep", "learning", "and", "nlp", "is", "fun"]
word_to_idx = {word: index for index, word in enumerate(tokens)}
idx_to_word = dict(enumerate(tokens))

data = [
    ["i", "like", "deep", "learning"],
    ["deep", "learning", "and", "nlp"],
    ["nlp", "is", "fun", "i"]
]

# The first three tokens of each sentence form the model input and the
# fourth token is the prediction target.
X = torch.tensor([[word_to_idx[word] for word in sentence[:3]] for sentence in data])
y = torch.tensor([word_to_idx[sentence[3]] for sentence in data])


Step 2: GRU Model

In [None]:
class GRUModel(nn.Module):
    """Next-token classifier: embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: Number of distinct token ids; also the number of output logits.
        emb_size: Width of each token embedding vector.
        hidden_size: Width of the GRU hidden state.
    """

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map batch-first token ids ``x`` to one row of vocabulary logits per sequence."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.gru(embedded)
        # Only the hidden state at the last time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


Step 3: LSTM Model

In [None]:
class LSTMModel(nn.Module):
    """Next-token classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        vocab_size: Number of distinct token ids; also the number of output logits.
        emb_size: Width of each token embedding vector.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, emb_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map batch-first token ids ``x`` to one row of vocabulary logits per sequence."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        # Only the hidden state at the last time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


Step 4: Training Function

In [None]:
def train(model, X, y, epochs=300, lr=0.01, log_every=100):
    """Fit ``model`` on the full batch ``(X, y)`` with Adam and cross-entropy loss.

    Args:
        model: Module whose forward pass maps ``X`` to class logits.
        X: Input tensor, passed to ``model`` unchanged.
        y: Target class indices, one per row of the model output.
        epochs: Number of full-batch optimisation steps.
        lr: Learning rate handed to Adam.
        log_every: Print the loss every this many epochs, starting at epoch 0.
            Pass ``0`` or ``None`` to disable logging.

    Returns:
        None. ``model`` is updated in place and left in training mode.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # The mode never changes inside the loop, so switch it once up front.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        loss = criterion(model(X), y)
        loss.backward()
        optimizer.step()
        # Guard against log_every == 0 so disabling logging cannot divide by zero.
        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch}/{epochs}, Loss: {loss.item():.4f}")


Step 5: Run Experiments

In [None]:
# Hyperparameters shared by both models so the comparison is like-for-like.
vocab_size = len(tokens)
emb_size = 10
hidden_size = 16
SEED = 42

print("\n--- Training GRU ---")
# Re-seed before building each model so weight initialisation (and therefore
# every printed loss) is reproducible across kernel restarts and does not
# depend on the order in which the two models are constructed.
torch.manual_seed(SEED)
gru_model = GRUModel(vocab_size, emb_size, hidden_size)
train(gru_model, X, y)

print("\n--- Training LSTM ---")
torch.manual_seed(SEED)
lstm_model = LSTMModel(vocab_size, emb_size, hidden_size)
train(lstm_model, X, y)


Final: Compare Predictions

In [None]:
def predict_next_word(model, seed_words):
    """Predict the word that follows ``seed_words`` according to ``model``.

    Args:
        model: Module mapping a batch of one token-id sequence to one row of
            vocabulary logits.
        seed_words: Non-empty iterable of words, each present in ``word_to_idx``.

    Returns:
        The vocabulary word whose logit is largest.

    Raises:
        ValueError: If ``seed_words`` is empty.
        KeyError: If any seed word is missing from ``word_to_idx``.
    """
    seed_words = list(seed_words)
    if not seed_words:
        raise ValueError("seed_words must contain at least one word")
    unknown = [w for w in seed_words if w not in word_to_idx]
    if unknown:
        # Same exception type as a raw dict lookup, but naming every offender.
        raise KeyError(f"words not in vocabulary: {unknown}")

    model.eval()
    # Create the input on the model's own device so a model that has been
    # moved off the CPU still works; parameter-free models use the default.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    input_seq = torch.tensor([[word_to_idx[w] for w in seed_words]], device=device)
    with torch.no_grad():
        out = model(input_seq)
        pred_idx = torch.argmax(out, dim=1).item()
    return idx_to_word[pred_idx]

# Ask both trained models to complete the same three-word prompt.
prompt = ["i", "like", "deep"]
gru_guess = predict_next_word(gru_model, prompt)
print("\nGRU Prediction:", gru_guess)
lstm_guess = predict_next_word(lstm_model, prompt)
print("LSTM Prediction:", lstm_guess)


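Takeaways:
- A GRU has fewer parameters than an LSTM with the same hidden size (three sets of gate weights instead of four), so it often converges faster.
- An LSTM may perform better on longer contexts, where its separate cell state helps it retain information.
- On a small task like this one, both models perform similarly.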