# LSTM

LSTMs are RNNs designed to handle sequential data and are easily adaptable for sentiment analysis. They're designed to maintain memory cells and gates to capture long-range patterns that traditional RNNs fail to do so per the vanishing gradient problem.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('sentiment140_cleaned.csv')
texts = df['clean_text'].astype(str)
labels = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size = 0.2, random_state = 734)

Let's start off with a tokenizer and look to build a vocabulary. For our embeddings, we'll load in pre-trained embeddings (GloVe). We will also look to do compute heavy tasks (training) on GPU.

In [2]:
import torch
from torchtext.vocab import GloVe, build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# Pre-trained embeddings
dim = 300 # 100, 300
glove = GloVe(name = '6B', dim = dim)

# Tokenizer
tokenizer = get_tokenizer("basic_english")
tokenized_texts = [tokenizer(text) for text in X_train]

# Vocab
def yield_tokens(tokenized_texts):
    for tokens in tokenized_texts:
        yield tokens

vocab = build_vocab_from_iterator(
    yield_tokens(tokenized_texts),
    specials=["<unk>", "<pad>"]
)
vocab.set_default_index(vocab["<unk>"])

# Embedding matrix
embedding_matrix = torch.zeros(len(vocab), dim)
for i, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[i] = glove[token]
    else:
        embedding_matrix[i] = torch.randn(dim)  # random for out of vocab tokens

Now we have our embeddings/vocab setup so we can transform our text into tensors and define a dataset/dataloader to load our data into the model for training.

In [5]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

def text_pipeline(text, tokenizer, vocab):
    return torch.tensor([vocab[token] for token in tokenizer(text)], dtype=torch.long)

# Convert train/test into indexed tensors
X_train_indices = [text_pipeline(text, tokenizer, vocab) for text in X_train]
X_test_indices  = [text_pipeline(text, tokenizer, vocab) for text in X_test]

# Pad our sequences
X_train_padded = pad_sequence(X_train_indices, batch_first=True, padding_value=vocab["<pad>"])
X_test_padded  = pad_sequence(X_test_indices, batch_first=True, padding_value=vocab["<pad>"])

# Convert labels into tensors
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor  = torch.tensor(y_test.values, dtype=torch.long)

# Dataset
train_dataset = TensorDataset(X_train_padded, y_train_tensor)
test_dataset  = TensorDataset(X_test_padded, y_test_tensor)

# Data Loader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=64)

Now we're ready to define the LSTM classifier!

In [49]:
import torch.nn as nn

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix, pad_idx, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(embedding_matrix)
        self.embedding.weight.requires_grad = True

        # Bidirectional LSTM with multiple layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, (hidden, _) = self.lstm(embedded)
        # Concat last forward + backward hidden states from top layer
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # Shape: (batch, hidden_dim * 2)
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out

In [None]:
import torch.optim as optim

epochs = 20
model = LSTMSentiment(
    vocab_size = len(vocab),
    embedding_dim = dim,
    hidden_dim = 256,
    output_dim = 2,
    embedding_matrix = embedding_matrix,
    pad_idx = vocab["<pad>"],
    dropout=0.5
)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.argmax(1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    acc = correct / total
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Acc: {acc:.4f}")

model.eval()
correct, total = 0, 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        preds = outputs.argmax(1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Test Accuracy: {correct/total:.4f}")

Epoch 1/20, Loss: 0.6895, Acc: 0.5308
Epoch 2/20, Loss: 0.6709, Acc: 0.5883
Epoch 3/20, Loss: 0.6173, Acc: 0.6627
Epoch 4/20, Loss: 0.5948, Acc: 0.6866
Epoch 5/20, Loss: 0.5684, Acc: 0.7061
Epoch 6/20, Loss: 0.5542, Acc: 0.7125
Epoch 7/20, Loss: 0.5397, Acc: 0.7221
Epoch 8/20, Loss: 0.5262, Acc: 0.7371
Epoch 9/20, Loss: 0.5189, Acc: 0.7448
Epoch 10/20, Loss: 0.5071, Acc: 0.7518
Epoch 11/20, Loss: 0.4938, Acc: 0.7601
Epoch 12/20, Loss: 0.4897, Acc: 0.7660
Epoch 13/20, Loss: 0.4848, Acc: 0.7678
Epoch 14/20, Loss: 0.4647, Acc: 0.7839
Epoch 15/20, Loss: 0.4651, Acc: 0.7775
Epoch 16/20, Loss: 0.4535, Acc: 0.7879
Epoch 17/20, Loss: 0.4492, Acc: 0.7930
Epoch 18/20, Loss: 0.4415, Acc: 0.7961
Epoch 19/20, Loss: 0.4283, Acc: 0.7997
