# 0. Environment setup

In [1]:
import numpy as np
import pandas as pd
import random
import os
import math

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix, f1_score
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader


In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7bda22f9f590>

In [3]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Using the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Using the GPU: NVIDIA GeForce GTX 1060 6GB


# 1. Load data

In [4]:
TRAIN_DATA_PATH = "dataset/processed/train.csv"
train_df = pd.read_csv(TRAIN_DATA_PATH)

print(train_df.head())
print(train_df.info())
print(train_df['Sentiment'].value_counts())

                                             Comment Sentiment
0  it’s so adorable that he says “baap” for up an...  positive
1  sir i have no words to describe your teaching ...  positive
2  the reason they said large and open space inst...   neutral
3  for ur information this is an fact that jrntr ...   neutral
4  you can really tell the progress awesome espec...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14691 entries, 0 to 14690
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    14691 non-null  object
 1   Sentiment  14691 non-null  object
dtypes: object(2)
memory usage: 229.7+ KB
None
Sentiment
positive    9121
neutral     3700
negative    1870
Name: count, dtype: int64


In [5]:
TEST_DATA_PATH = "dataset/processed/test.csv"
test_df = pd.read_csv(TEST_DATA_PATH)

print(test_df.head())
print(test_df.info())
print(test_df['Sentiment'].value_counts())

                                             Comment Sentiment
0  “oh my god guys there’s an octopus eating a cr...  negative
1  my daughter will be starting her 8th grade che...  positive
2  for some future video you should definitely bu...   neutral
3  i’m chronically ill and very frequently find i...  positive
4  the pizza planet pizza being awful is just dis...  negative
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3673 entries, 0 to 3672
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    3673 non-null   object
 1   Sentiment  3673 non-null   object
dtypes: object(2)
memory usage: 57.5+ KB
None
Sentiment
positive    2281
neutral      925
negative     467
Name: count, dtype: int64


# 2. Remove stop words

In [6]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
keep_words = {
    "not", "no", "nor",
    "don't", "didn't", "doesn't",
    "isn't", "wasn't", "aren't", "weren't",
    "can't", "couldn't", "won't", "wouldn't",
    "shouldn't", "haven't", "hasn't", "hadn't"
}

stop_words = stop_words - keep_words

def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\d+", " ", text)
    # tokenize
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

train_df['clean_comment'] = train_df['Comment'].astype(str).apply(clean_text)
test_df['clean_comment'] = test_df['Comment'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yuweihuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yuweihuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 3. Prepare data

In [7]:
TEXT_COL = "clean_comment"   
LABEL_COL = "Sentiment"

X = train_df[TEXT_COL]
y = train_df[LABEL_COL]

le = LabelEncoder()
y_int = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(
    X, y_int,
    test_size=0.2,
    random_state=SEED,
    stratify=y_int
)

print("Train size:", len(X_train))
print("Val size:", len(X_val))


Train size: 11752
Val size: 2939


In [8]:
X_test = test_df[TEXT_COL]
y_test = le.transform(test_df[LABEL_COL])

print("Test size:", len(X_test))

preview = pd.DataFrame({
    "text": X_test.head().values,
    "label_int": y_test[:5],
    "label": le.inverse_transform(y_test[:5])
})
print(preview)


Test size: 3673
                                                text  label_int     label
0  “ oh god guys ’ octopus eating crab ” watches ...          0  negative
1  daughter starting th grade chem section next w...          2  positive
2  future video definitely build like huge base o...          1   neutral
3  ’ chronically ill frequently find difficult ea...          2  positive
4  pizza planet pizza awful disney sticking bit s...          0  negative


# 4. Tokenizer + padding

In [9]:
max_words = 12000

from typing import Iterable

class SimpleTokenizer:
    def __init__(self, num_words, oov_token="<OOV>"):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts: Iterable[str]):
        counter = Counter()
        for text in texts:
            counter.update(text.split())
        vocab = counter.most_common(self.num_words - 1)  # reserve 0 for padding, 1 for OOV
        self.word_index = {self.oov_token: 1}
        idx = 2
        for word, _ in vocab:
            if idx >= self.num_words:
                break
            self.word_index[word] = idx
            idx += 1
        self.index_word = {idx: word for word, idx in self.word_index.items()}

    def texts_to_sequences(self, texts: Iterable[str]):
        seqs = []
        for text in texts:
            seq = []
            for word in text.split():
                idx = self.word_index.get(word)
                if idx is None or idx >= self.num_words:
                    idx = self.word_index[self.oov_token]
                seq.append(idx)
            seqs.append(seq)
        return seqs

def pad_sequences_custom(seqs, maxlen, padding='post', truncating='post'):
    padded = np.zeros((len(seqs), maxlen), dtype=np.int64)
    for i, seq in enumerate(seqs):
        if len(seq) > maxlen:
            trunc = seq[-maxlen:] if truncating == 'pre' else seq[:maxlen]
        else:
            trunc = seq
        if padding == 'pre':
            padded[i, -len(trunc):] = trunc
        else:
            padded[i, :len(trunc)] = trunc
    return padded

def texts_to_padded(texts, tokenizer, max_len):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences_custom(seqs, maxlen=max_len, padding='post', truncating='post')


# 5. Load pretrained embeddings

In [10]:
import gensim.downloader as api

cache_path = "dataset/embeddings/glove-twitter-100.npy"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)


def build_embedding_matrix(tokenizer, max_words, cache_dir="dataset/embeddings"):
    if os.path.exists(cache_path):
        embedding_matrix = np.load(cache_path)
        embedding_dim = embedding_matrix.shape[1]
        print(f"Loaded cached embeddings from {cache_path}, dim={embedding_dim}")
        return embedding_matrix.astype(np.float32)

    glove = api.load("glove-twitter-100")

    embedding_dim = glove.vector_size
    embedding_matrix = np.random.normal(scale=0.6, size=(max_words, embedding_dim)).astype(np.float32)
    embedding_matrix[0] = np.zeros(embedding_dim, dtype=np.float32)

    valid = [(w, i) for w, i in tokenizer.word_index.items() if i < max_words]
    hits = 0
    if valid:
        stoi = glove.key_to_index
        vectors = glove.vectors
        fill_indices = []
        vec_indices = []
        for w, idx in valid:
            key = w if w in stoi else w.lower() if w.lower() in stoi else None
            if key is None:
                continue
            fill_indices.append(idx)
            vec_indices.append(stoi[key])
        if fill_indices:
            embedding_matrix[fill_indices] = vectors[vec_indices]
            hits = len(fill_indices)
    np.save(cache_path, embedding_matrix)
    print(f"GloVe hits: {hits}/{len(tokenizer.word_index)}")
    print(f"Saved embeddings to {cache_path}")
    return embedding_matrix.astype(np.float32)


# 6. Initialize model, loss, dataloader, optimizer

## Model with pretrained embeddings frozen

In [11]:
num_classes = len(le.classes_)

class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, dense_dim, num_classes, dropout):
        super().__init__()
        vocab_size, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            padding_idx=0,
            freeze=True,  # set to False to fine-tune embeddings
        )
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, dense_dim)
        self.fc2 = nn.Linear(dense_dim, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)
        h_cat = torch.cat((h[-2], h[-1]), dim=1)  # concat both directions
        x = torch.relu(self.fc1(h_cat))
        x = self.dropout(x)
        return self.fc2(x)


## Loss with class weights

In [12]:
classes = np.unique(y_int)

class_weights_arr = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_int
)

class_weights = dict(zip(classes, class_weights_arr))
print("Class weights:", class_weights)

class_weights_tensor = torch.tensor(
    [class_weights.get(i, 1.0) for i in range(num_classes)],
    dtype=torch.float32,
    device=device
)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)


Class weights: {0: 2.618716577540107, 1: 1.3235135135135134, 2: 0.5368928845521325}


## Build dataLoader

In [13]:
batch_size = 128

def build_loaders(X_pad, y_labels, train_idx, val_idx, batch_size):
    train_ds = TensorDataset(
        torch.tensor(X_pad[train_idx], dtype=torch.long),
        torch.tensor(y_labels[train_idx], dtype=torch.long)
    )
    val_ds = TensorDataset(
        torch.tensor(X_pad[val_idx], dtype=torch.long),
        torch.tensor(y_labels[val_idx], dtype=torch.long)
    )
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size)
    return train_dl, val_dl


def train_one_epoch(model, dataloader, optimizer, criterion, scheduler=None):
    model.train()
    running_loss = 0.0
    for xb, yb in dataloader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        running_loss += loss.item() * xb.size(0)
    return running_loss / len(dataloader.dataset)


def eval_model(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = criterion(out, yb)
            running_loss += loss.item() * xb.size(0)
            pred = out.argmax(dim=1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
            y_true.append(yb.cpu())
            y_pred.append(pred.cpu())
    avg_loss = running_loss / len(dataloader.dataset)
    acc = correct / total if total else 0.0
    if y_true and y_pred:
        y_true_cat = torch.cat(y_true).numpy()
        y_pred_cat = torch.cat(y_pred).numpy()
        macro_f1 = f1_score(y_true_cat, y_pred_cat, average="macro")
    else:
        macro_f1 = 0.0
    return avg_loss, acc, macro_f1


def build_cosine_warmup_scheduler(optimizer, total_steps, warmup_ratio=0.1):
    warmup_steps = max(1, int(warmup_ratio * total_steps))

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step + 1) / float(warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1 + math.cos(math.pi * progress))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


# 7. Random search (macro F1)


In [14]:
from sklearn.model_selection import ParameterSampler, StratifiedKFold

param_dist = {
    "max_len": [75, 100, 150],
    "lstm_units": [32, 64, 96],
    "dense_units": [32, 64, 96],
    "dropout": [0.3, 0.4, 0.5],
    "lr": [3e-3, 2e-3, 1e-3]
}

search_epochs = 10
n_iter_search = 5
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)

X_all = train_df[TEXT_COL].astype(str).values
y_all = y_int

best_params = None
best_score = -float("inf")
trial_history = []

for trial_idx, params in enumerate(ParameterSampler(param_dist, n_iter=n_iter_search, random_state=SEED), 1):
    print(f"=== Trial {trial_idx}/{n_iter_search}: {params} ===")
    tokenizer = SimpleTokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_all)
    X_pad = texts_to_padded(X_all, tokenizer, params["max_len"])
    embedding_matrix = build_embedding_matrix(tokenizer, max_words)

    fold_f1s = []
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_pad, y_all), 1):
        train_dl, val_dl = build_loaders(X_pad, y_all, train_idx, val_idx, batch_size)
        model = BiLSTMClassifier(
            embedding_matrix,
            params["lstm_units"],
            params["dense_units"],
            num_classes,
            params["dropout"]
        ).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
        total_steps = search_epochs * len(train_dl)
        scheduler = build_cosine_warmup_scheduler(optimizer, total_steps)

        best_fold_f1 = 0.0
        for epoch in range(search_epochs):
            train_loss = train_one_epoch(model, train_dl, optimizer, criterion, scheduler)
            val_loss, val_acc, val_f1 = eval_model(model, val_dl, criterion)
            best_fold_f1 = max(best_fold_f1, val_f1)
            #print(f"Fold {fold_idx} Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, val_macro_f1={val_f1:.4f}")
        fold_f1s.append(best_fold_f1)

    mean_f1 = float(np.mean(fold_f1s)) if fold_f1s else 0.0
    trial_history.append({"params": params, "mean_val_macro_f1": mean_f1})
    print(f"Trial {trial_idx} mean val macro F1: {mean_f1:.4f}")

    if mean_f1 > best_score:
        best_score = mean_f1
        best_params = params

print("\nBest params:", best_params)
print(f"Best CV macro F1: {best_score:.4f}")


=== Trial 1/5: {'max_len': 75, 'lstm_units': 96, 'lr': 0.001, 'dropout': 0.3, 'dense_units': 32} ===
Loaded cached embeddings from dataset/embeddings/glove-twitter-100.npy, dim=100
Trial 1 mean val macro F1: 0.6850
=== Trial 2/5: {'max_len': 75, 'lstm_units': 96, 'lr': 0.003, 'dropout': 0.3, 'dense_units': 32} ===
Loaded cached embeddings from dataset/embeddings/glove-twitter-100.npy, dim=100
Trial 2 mean val macro F1: 0.6971
=== Trial 3/5: {'max_len': 150, 'lstm_units': 96, 'lr': 0.002, 'dropout': 0.5, 'dense_units': 64} ===
Loaded cached embeddings from dataset/embeddings/glove-twitter-100.npy, dim=100
Trial 3 mean val macro F1: 0.6926
=== Trial 4/5: {'max_len': 100, 'lstm_units': 96, 'lr': 0.002, 'dropout': 0.5, 'dense_units': 96} ===
Loaded cached embeddings from dataset/embeddings/glove-twitter-100.npy, dim=100
Trial 4 mean val macro F1: 0.6923
=== Trial 5/5: {'max_len': 100, 'lstm_units': 64, 'lr': 0.001, 'dropout': 0.5, 'dense_units': 96} ===
Loaded cached embeddings from datase

# 8. Train final model with best params


In [15]:
assert best_params is not None, "Run the random search cell first to set best_params"

best_max_len = best_params["max_len"]
best_lstm_units = best_params["lstm_units"]
best_dense_units = best_params["dense_units"]
best_dropout = best_params["dropout"]
best_lr = best_params["lr"]

# Fit tokenizer on full training data to align with cached embeddings
best_tokenizer = SimpleTokenizer(num_words=max_words, oov_token="<OOV>")
best_tokenizer.fit_on_texts(train_df[TEXT_COL].astype(str).values)

# Pad splits using the same tokenizer
X_train_pad = texts_to_padded(X_train.astype(str).values, best_tokenizer, best_max_len)
X_val_pad = texts_to_padded(X_val.astype(str).values, best_tokenizer, best_max_len)
X_test_pad = texts_to_padded(test_df[TEXT_COL].astype(str).values, best_tokenizer, best_max_len)

# ensure label tensors are numeric arrays
_y_train = np.array(y_train, dtype=np.int64)
_y_val = np.array(y_val, dtype=np.int64)

embedding_matrix = build_embedding_matrix(best_tokenizer, max_words)

train_ds = TensorDataset(
    torch.tensor(X_train_pad, dtype=torch.long),
    torch.tensor(_y_train, dtype=torch.long)
)
val_ds = TensorDataset(
    torch.tensor(X_val_pad, dtype=torch.long),
    torch.tensor(_y_val, dtype=torch.long)
)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)

test_ds = TensorDataset(torch.tensor(X_test_pad, dtype=torch.long))
test_dl = DataLoader(test_ds, batch_size=256)

model = BiLSTMClassifier(
    embedding_matrix,
    best_lstm_units,
    best_dense_units,
    num_classes,
    best_dropout
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)
num_epochs_final = 10
total_steps_final = num_epochs_final * len(train_dl)
scheduler = build_cosine_warmup_scheduler(optimizer, total_steps_final)

MODEL_PATH = "outputs/lstm/best_lstm.pt"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

best_val_f1 = -float("inf")
best_epoch = 0

for epoch in range(num_epochs_final):
    train_loss = train_one_epoch(model, train_dl, optimizer, criterion, scheduler)
    val_loss, val_acc, val_f1 = eval_model(model, val_dl, criterion)
    print(f"Final Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, val_macro_f1={val_f1:.4f}")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_epoch = epoch + 1
        torch.save(model.state_dict(), MODEL_PATH)

print(f"Best val macro F1: {best_val_f1:.4f} at epoch {best_epoch}")
print("Saved best model to", MODEL_PATH)


Loaded cached embeddings from dataset/embeddings/glove-twitter-100.npy, dim=100
Final Epoch 1: train_loss=0.9360, val_loss=0.7517, val_acc=0.6798, val_macro_f1=0.6100
Final Epoch 2: train_loss=0.7312, val_loss=0.7273, val_acc=0.7322, val_macro_f1=0.6515
Final Epoch 3: train_loss=0.6540, val_loss=0.6709, val_acc=0.6999, val_macro_f1=0.6466
Final Epoch 4: train_loss=0.6018, val_loss=0.6859, val_acc=0.7387, val_macro_f1=0.6753
Final Epoch 5: train_loss=0.5551, val_loss=0.6618, val_acc=0.7390, val_macro_f1=0.6771
Final Epoch 6: train_loss=0.5114, val_loss=0.6668, val_acc=0.7397, val_macro_f1=0.6791
Final Epoch 7: train_loss=0.4657, val_loss=0.7039, val_acc=0.7326, val_macro_f1=0.6710
Final Epoch 8: train_loss=0.4218, val_loss=0.7098, val_acc=0.7506, val_macro_f1=0.6871
Final Epoch 9: train_loss=0.3907, val_loss=0.7582, val_acc=0.7550, val_macro_f1=0.6911
Final Epoch 10: train_loss=0.3762, val_loss=0.7520, val_acc=0.7526, val_macro_f1=0.6879
Best val macro F1: 0.6911 at epoch 9
Saved best m

In [16]:
from torchinfo import summary

summary(
    model,
    input_size=(1, best_max_len),      # batch, seq_len
    dtypes=[torch.long],
    col_names=("input_size", "output_size", "num_params"),
)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
BiLSTMClassifier                         [1, 75]                   [1, 3]                    --
├─Embedding: 1-1                         [1, 75]                   [1, 75, 100]              (1,200,000)
├─LSTM: 1-2                              [1, 75, 100]              [1, 75, 192]              152,064
├─Linear: 1-3                            [1, 192]                  [1, 32]                   6,176
├─Dropout: 1-4                           [1, 32]                   [1, 32]                   --
├─Linear: 1-5                            [1, 32]                   [1, 3]                    99
Total params: 1,358,339
Trainable params: 158,339
Non-trainable params: 1,200,000
Total mult-adds (Units.MEGABYTES): 12.61
Input size (MB): 0.00
Forward/backward pass size (MB): 0.18
Params size (MB): 5.43
Estimated Total Size (MB): 5.61

# 8. Test set results and output

In [17]:
model = BiLSTMClassifier(
    embedding_matrix,
    best_lstm_units,
    best_dense_units,
    num_classes,
    best_dropout
).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()


BiLSTMClassifier(
  (embedding): Embedding(12000, 100, padding_idx=0)
  (lstm): LSTM(100, 96, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=192, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=3, bias=True)
)

In [18]:
probas = []
with torch.no_grad():
    for (xb,) in test_dl:
        xb = xb.to(device)
        out = model(xb)
        probas.append(torch.softmax(out, dim=1).cpu())

y_test_proba = torch.cat(probas, dim=0).numpy()
y_test_pred  = np.argmax(y_test_proba, axis=1)

In [19]:
acc = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {acc:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4, target_names=le.classes_))

print("\nOne-vs-Rest AUC:")
for idx, cls_name in enumerate(le.classes_):
    y_true_bin = (y_test == idx).astype(int)
    auc = roc_auc_score(y_true_bin, y_test_proba[:, idx])
    print(f"AUC for {cls_name} ({idx}): {auc:.4f}")

cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (rows = true, cols = predicted):")
print(cm)


Accuracy: 0.7493

Classification Report:
              precision    recall  f1-score   support

    negative     0.4967    0.6403    0.5594       467
     neutral     0.5822    0.6854    0.6296       925
    positive     0.9178    0.7975    0.8534      2281

    accuracy                         0.7493      3673
   macro avg     0.6655    0.7077    0.6808      3673
weighted avg     0.7797    0.7493    0.7597      3673


One-vs-Rest AUC:
AUC for negative (0): 0.8918
AUC for neutral (1): 0.8630
AUC for positive (2): 0.9134

Confusion Matrix (rows = true, cols = predicted):
[[ 299  115   53]
 [ 181  634  110]
 [ 122  340 1819]]


In [20]:
pred_df = pd.DataFrame({
    "id": X_test.index,
    "comment": test_df.loc[X_test.index, "Comment"].values,
    "true_label": y_test.astype(int),
    "pred_label": y_test_pred.astype(int),
})

label_to_id = {cls: idx for idx, cls in enumerate(le.classes_)}
for idx, cls in enumerate(le.classes_):
    pred_df[f"prob_{label_to_id[cls]}"] = y_test_proba[:, idx]


In [21]:
OUTPUT_CSV = f"outputs/lstm/test_predictions_rs{SEED}.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
pred_df.to_csv(OUTPUT_CSV, index=False)
print("Saved to", OUTPUT_CSV)


Saved to outputs/lstm/test_predictions_rs42.csv
