In [1]:
import torch
import torch.nn as nn
import pandas as pd

from collections import Counter
from typing import Dict, List
import re

# Choose device

In [2]:
device: str = "cpu"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

# Dataset

In [3]:
def clean_text(text: str) -> str:
    text = re.sub(r'<[^>]+>', '', text)
        
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)    
    s = re.sub(r'\s{2,}', ' ', s)            
    text = s

    text = re.sub(r' +', ' ', text)

    text = re.sub(r'[^\x00-\x7f]', r'', text)

    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    text = emojis.sub(r'', text)
    text = re.sub(r'(.)\1+', r'\1\1', text)

    return text


In [4]:
data = pd.read_csv("../data/IMDB.csv")

sentences = data["review"].values
words = " ".join(sentences)
words = clean_text(words)
words = words.split()

In [5]:
class Tokenizer:
    def __init__(self, words: List[str], clean_text: bool = False) -> None:
        counter = Counter(words)
        self.vocab = sorted(counter, key=counter.get, reverse=True)
        self.int2word = dict(enumerate(self.vocab, 1))
        self.int2word[0] = "<PAD>"
        self.word2int = {word: id for id, word in self.int2word.items()}
        
    def __len__(self) -> int:
        return len(self.vocab)
    
    def tokenize(self, sentence: str) -> List[str]:
        return sentence.split()
    
    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        result = []
        for word in tokens:
          if word in self.word2int:
            result.append(self.word2int[word])
        return result
    
    def convert_ids_to_tokens(self, ids: List[int] | torch.LongTensor) -> List[str]:
        if isinstance(ids, torch.LongTensor):
            return [self.int2word[id.item()] for id in ids]
        else: 
            return [self.int2word[id] for id in ids]
    
tokenizer = Tokenizer(words)
        

In [6]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

class IMDBDataset(Dataset):
    def __init__(self, root: str = "../data/IMDB.csv", train: bool = True, test_size: float = 0.33, tokenizer: Tokenizer = None) -> None:
        super().__init__()

        data = pd.read_csv(root)
        data = pd.get_dummies(data, columns=["sentiment"])
        data = data.rename({"review": "sentence", 
                            "sentiment_negative": "negative", 
                            "sentiment_positive": "positive"}, axis=1)
        
        X = data["sentence"].values
        y = data.drop(["sentence"], axis=1).values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        if train:
            self.X, self.y = X_train, y_train
        else:
            self.X, self.y = X_test, y_test

        self.tokenizer: Tokenizer = tokenizer

    def __len__(self) -> int:
        return len(self.X)
    
    def __getitem__(self, idx):
        sentence = clean_text(self.X[idx])
        tokens = self.tokenizer.tokenize(sentence)
        ids = self.tokenizer.convert_tokens_to_ids(tokens)

        ids = torch.tensor(ids, dtype=torch.long)
        labels = torch.tensor(self.y[idx], dtype=torch.float32)
        return ids, labels

In [12]:
from torch.utils.data import DataLoader

train_dataset = IMDBDataset(tokenizer=tokenizer)
test_dataset = IMDBDataset(train=False, tokenizer=tokenizer)

batch_size: int = 64

train_loader = DataLoader(train_dataset)
test_loader = DataLoader(test_dataset)

# Model

In [13]:
class EmotionalLSTM(nn.Module):
    def __init__(self, 
                 vocab_size: int, 
                 emb_size: int, 
                 hidden_size: int, 
                 num_stacked_layers: int = 3, 
                 dropout: float = 0.2) -> None:
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hidden_size, num_stacked_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.LongTensor) -> torch.Tensor:
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:,-1,:]
        out = self.dropout(out)
        out = self.fc(out)
        out = self.sigmoid(out)

        return out

vocab_size: int = len(tokenizer)
emb_size: int = 256
hidden_size: int = 512
num_stacked_layers: int = 2
dropout: float = 0.25

model: EmotionalLSTM = EmotionalLSTM(vocab_size, emb_size, hidden_size, num_stacked_layers, dropout).to(device)

print(model)

RuntimeError: MPS backend out of memory (MPS allocated: 2.02 GB, other allocations: 6.87 GB, max allowed: 9.07 GB). Tried to allocate 226.38 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

# Train

In [11]:
from tqdm import tqdm
import torch.optim as optim

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

train_loss_history = []
test_loss_history = []

for epoch in range(num_epochs):
    train_loss = 0.0
    test_loss = 0.0


    model.train()
    for sentences, labels in tqdm(train_loader):
        sentences, labels = sentences.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(sentences)

        # Calculate loss
        loss = criterion(outputs, labels)

        train_loss_history.append(loss.item())

        # Backward pass and optimize
        loss.backward()
        optimizer.step()
    train_loss = loss.item()

    model.eval()
    for sentences, labels in test_loader:
        sentectes, labels = sentences.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(sentectes)

        loss = criterion(outputs, labels)
        test_loss_history.append(loss.item())

    test_loss = loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Val_loss: {test_loss:.4f}')


  0%|          | 34/33500 [00:09<2:31:17,  3.69it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 2.02 GB, other allocations: 6.89 GB, max allowed: 9.07 GB). Tried to allocate 226.38 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_loss_history, label="train_loss")
#plt.plot(test_loss_history, label="val_loss")
plt.xlabel("batch")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
accuracy = 0
total_samples = 0

loader = test_loader

model.eval()  # Set the model to evaluation mode
for batch, (X, y) in enumerate(loader):
    with torch.no_grad():
        y_pred = model(X.to("mps"))
    
    y = y.to("mps")
    _, predicted = torch.max(y_pred, 1)
    
    # Convert one-hot encoded y to class indices if needed
    if y.dim() == 2:
        y = torch.argmax(y, dim=1)
    
    # Debug prints for shapes
    correct_predictions = (predicted == y).sum().item()
    accuracy += correct_predictions
    total_samples += y.size(0)

accuracy / (len(loader) * loader.batch_size)