In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Phrases (textual data) and their category labels (0 for sports, 1 for technology, 2 for food)
# note: this model might overfit as the data is less. feel free to use any other data source for training
# or create your own dummy data
phrases = ["great goal scored", "amazing touchdown", "new phone release", "latest laptop model", "tasty pizza", "delicious burger"]
categories = [0, 0, 1, 1, 2, 2]

# Create a vocabulary to represent words as indices
vocab = {"<PAD>": 0, "great": 1, "goal": 2, "scored": 3, "amazing": 4, "touchdown": 5, "new": 6, "phone": 7, "release": 8, "latest": 9, "laptop": 10, "model": 11, "tasty": 12, "pizza": 13, "delicious": 14, "burger": 15}

# Tokenize, encode, and pad phrases
encoded_phrases = [[vocab[word] for word in phrase.split()] for phrase in phrases]
max_length = max([len(phrase) for phrase in encoded_phrases])
padded_phrases = [phrase + [vocab["<PAD>"]] * (max_length - len(phrase)) for phrase in encoded_phrases]

# Convert phrases and categories to PyTorch tensors
inputs = torch.LongTensor(padded_phrases)
labels = torch.LongTensor(categories)

# Define LSTM model
class PhraseClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(PhraseClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, _) = self.lstm(embedded)
        logits = self.fc(hidden.squeeze(0))
        return logits

# Instantiate model and define loss and optimizer
model = PhraseClassifier(len(vocab), embedding_dim=10, hidden_dim=20, output_dim=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model for a number of epochs
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    predictions = model(inputs.t())
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Test the model on new phrases
with torch.no_grad():
    test_phrases = ["incredible match", "newest gadget", "yummy cake"]
    encoded_test_phrases = [[vocab.get(word, vocab["<PAD>"]) for word in phrase.split()] for phrase in test_phrases]
    padded_test_phrases = [phrase + [vocab["<PAD>"]] * (max_length - len(phrase)) for phrase in encoded_test_phrases]
    test_inputs = torch.LongTensor(padded_test_phrases)
    test_predictions = torch.argmax(model(test_inputs.t()), dim=1)
    print("Test predictions:", test_predictions)


Epoch: 100, Loss: 0.4094054698944092
Epoch: 200, Loss: 0.03397029638290405
Epoch: 300, Loss: 0.012075490318238735
Epoch: 400, Loss: 0.006638178136199713
Epoch: 500, Loss: 0.0043108901008963585
Epoch: 600, Loss: 0.0030620284378528595
Epoch: 700, Loss: 0.0023015905171632767
Epoch: 800, Loss: 0.0017992026405408978
Epoch: 900, Loss: 0.0014477409422397614
Epoch: 1000, Loss: 0.0011911113979294896
Test predictions: tensor([2, 2, 2])


The results clearly indicate an overfitted model. We can improve the testing accuracy by getting more data. That would be an Optional TODO for you.