In [18]:
!pip install torchtext datasets pandas matplotlib nltk




In [2]:
import re, json, random, math, time
from collections import Counter
from pathlib import Path

import numpy as np, pandas as pd
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

DATASET_ROOT = "hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/"
SPLITS       = dict(train="train_df.csv",
                    validation="val_df.csv",
                    test="test_df.csv")

VOCAB_SIZE   = 20_000          # incl. PAD+UNK
MAX_LEN      = 200
EMBED_DIM    = 128
HIDDEN_DIM   = 64
NUM_CLASSES  = 3
LR           = 3e-3
EPOCHS       = 10
BATCH_SIZE   = 128
DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"

ROOT_DIR     = Path.cwd()            # project root
MODEL_PT     = ROOT_DIR / "sentiment_bilstm_best.pt"
VOCAB_JSON   = ROOT_DIR / "vocab.json"   # handy for later re‑use


In [8]:
def clean_text(text:str) -> str:
    if pd.isna(text): return ""
    text = re.sub(r'<.*?>',  '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().strip()

dfs = {name: pd.read_csv(DATASET_ROOT + fn) for name,fn in SPLITS.items()}
for df in dfs.values():
    df["text"] = df["text"].fillna("").apply(clean_text)


In [19]:
PAD, UNK = "<PAD>", "<UNK>"

# Skip stop words
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words("english"))

counter = Counter()
for text in dfs["train"]["text"]:
    tokens = text.split()
    filtered = [w for w in tokens if w not in STOP_WORDS]
    counter.update(filtered)

most_common = [w for w,_ in counter.most_common(VOCAB_SIZE-2)]
itos = [PAD, UNK] + most_common
stoi = {tok:i for i,tok in enumerate(itos)}

print(most_common[:10])


with open(VOCAB_JSON, "w") as f: json.dump(itos, f)

PAD_IDX = stoi[PAD]; UNK_IDX = stoi[UNK]

def encode_pad(text:str, max_len:int=MAX_LEN):
    seq = [stoi.get(tok, UNK_IDX) for tok in text.split()]
    seq = seq[:max_len] + [PAD_IDX]*(max_len-len(seq))
    return torch.tensor(seq, dtype=torch.long)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tomal66/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['app', 'im', 'day', 'like', 'good', 'time', 'get', 'love', 'dont', 'one']


In [20]:
class SentDataset(Dataset):
    def __init__(self, df):
        self.seqs  = [encode_pad(t) for t in df["text"]]
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx): return self.seqs[idx], self.labels[idx]

train_ds = SentDataset(dfs["train"]);  val_ds = SentDataset(dfs["validation"])
test_ds  = SentDataset(dfs["test"])

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl   = DataLoader(val_ds,   batch_size=BATCH_SIZE)
test_dl  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

print(dfs["train"].head())


      id                                               text  label sentiment
0   9536                     cooking microwave pizzas yummy      2  positive
1   6135  any plans of allowing sub tasks to show up in ...      1   neutral
2  17697  i love the humor i just reworded it like sayin...      2  positive
3  14182                       naw idk what ur talkin about      1   neutral
4  17840           that sucks to hear i hate days like that      0  negative


In [4]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_sz, embed_dim, hidden_dim, num_cls, pad_idx):
        super().__init__()
        self.emb  = nn.Embedding(vocab_sz, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            bidirectional=True, batch_first=True)
        self.drop1 = nn.Dropout(0.5)

        self.fc1  = nn.Linear(hidden_dim*2, 32)
        self.bn   = nn.BatchNorm1d(32)
        self.drop2 = nn.Dropout(0.6)
        self.out  = nn.Linear(32, num_cls)

    def forward(self, x):
        x = self.emb(x)
        _, (h_n, _) = self.lstm(x)
        h = torch.cat((h_n[0], h_n[1]), dim=1)
        h = self.drop1(h)
        h = F.relu(self.fc1(h))
        h = self.bn(h)
        h = self.drop2(h)
        return self.out(h)

model = BiLSTMClassifier(len(itos), EMBED_DIM,
                         HIDDEN_DIM, NUM_CLASSES, PAD_IDX).to(DEVICE)


In [24]:
opt  = torch.optim.Adam(model.parameters(), lr=LR)
crit = nn.CrossEntropyLoss()
best_acc = 0

for epoch in range(1, EPOCHS+1):
    model.train();   running = 0
    for X,y in train_dl:
        X,y = X.to(DEVICE), y.to(DEVICE)
        opt.zero_grad() # Set gradients to zero
        loss = crit(model(X), y)
        loss.backward(); opt.step() # Gradient Descent
        running += loss.item()*len(y)

    # validation
    model.eval();  correct=total=0
    with torch.no_grad():
        for X,y in val_dl:
            X,y = X.to(DEVICE), y.to(DEVICE)
            pred = model(X).argmax(1)
            correct += (pred==y).sum().item()
            total   += len(y)
    val_acc = correct/total
    print(f"Epoch {epoch:2d} | train loss {running/len(train_ds):.4f}"
          f" | val acc {val_acc:.3%}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({"state_dict": model.state_dict(),
                    "itos": itos}, MODEL_PT)


Epoch  1 | train loss 1.0538 | val acc 55.716%
Epoch  2 | train loss 0.8733 | val acc 63.305%
Epoch  3 | train loss 0.7486 | val acc 65.668%
Epoch  4 | train loss 0.6595 | val acc 65.648%
Epoch  5 | train loss 0.5844 | val acc 66.148%
Epoch  6 | train loss 0.5171 | val acc 65.187%
Epoch  7 | train loss 0.4692 | val acc 65.360%
Epoch  8 | train loss 0.4160 | val acc 64.380%
Epoch  9 | train loss 0.3749 | val acc 64.784%
Epoch 10 | train loss 0.3425 | val acc 64.592%


In [23]:
INDEX2LABEL = {0:"negative", 1:"neutral", 2:"positive"}

def predict(texts):
    model.eval()
    X = torch.stack([encode_pad(clean_text(t)) for t in texts]).to(DEVICE)
    with torch.no_grad():
        preds = model(X).argmax(1).cpu().numpy()
    return [INDEX2LABEL[i] for i in preds]

samples = ["Utterly disappointing; the acting felt wooden.",
           "Let's play a game of chess instead.",
           "Can't wait to see the new movie!"]
print(list(zip(samples, predict(samples))))


# Visualize NN model on input

[('Utterly disappointing; the acting felt wooden.', 'negative'), ("Let's play a game of chess instead.", 'negative'), ("Can't wait to see the new movie!", 'negative')]


In [6]:
ckpt   = torch.load(MODEL_PT, map_location="cpu")
itos   = ckpt["itos"]
stoi   = {tok:i for i,tok in enumerate(itos)}
PAD_IDX, UNK_IDX = stoi["<PAD>"], stoi["<UNK>"]

def encode_pad_reload(text, max_len=MAX_LEN):
    seq = [stoi.get(tok, UNK_IDX) for tok in clean_text(text).split()]
    seq = seq[:max_len] + [PAD_IDX]*(max_len-len(seq))
    return torch.tensor(seq)

reloaded = BiLSTMClassifier(len(itos), EMBED_DIM,
                            HIDDEN_DIM, NUM_CLASSES, PAD_IDX)
reloaded.load_state_dict(ckpt["state_dict"])
reloaded.eval()


BiLSTMClassifier(
  (emb): Embedding(20000, 128, padding_idx=0)
  (lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=128, out_features=32, bias=True)
  (bn): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop2): Dropout(p=0.6, inplace=False)
  (out): Linear(in_features=32, out_features=3, bias=True)
)

In [None]:
# Sanity check on reloaded model

# Extremely negative
text = "This is the worst movie I have ever seen."
X = encode_pad_reload(text).unsqueeze(0)
X = X.to(DEVICE)
with torch.no_grad():
    pred = reloaded(X).argmax(1).cpu().numpy()
print(f"Prediction: {INDEX2LABEL[pred[0]]} (reloaded model)")


Prediction: negative (reloaded model)


In [None]:
!pip install torch torchvision torchviz graphviz

In [22]:
from torchviz import make_dot

# 1) Pick a sample text and encode it
sample_text = "I absolutely loved the cinematography but the plot was dull."
indices = encode_pad_reload(sample_text).unsqueeze(0).to(DEVICE)  # shape (1, MAX_LEN)

# 2) Run a forward pass and capture the output
model.eval()
out = model(indices)  # shape (1, NUM_CLASSES)

# 3) Build the graph
#    params=dict(model.named_parameters()) will link the parameter tensors into the graph
dot = make_dot(out, params=dict(model.named_parameters()))

# 4) Render to a file (will produce `bilstm_graph.png` and `bilstm_graph`)
dot.format = 'png'
dot.render('bilstm_graph')


'bilstm_graph.png'