In [1]:
import numpy as np
import pandas as pd
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torchinfo import summary

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x77a98a38f430>

In [3]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Using the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Using the GPU: NVIDIA GeForce GTX 1060 6GB


In [4]:
TRAIN_DATA_PATH = "dataset/processed/train.csv"
train_df = pd.read_csv(TRAIN_DATA_PATH)

print(train_df.head())
print(train_df.info())
print(train_df['Sentiment'].value_counts())

                                             Comment Sentiment
0  it’s so adorable that he says “baap” for up an...  positive
1  sir i have no words to describe your teaching ...  positive
2  the reason they said large and open space inst...   neutral
3  for ur information this is an fact that jrntr ...   neutral
4  you can really tell the progress awesome espec...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14691 entries, 0 to 14690
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    14691 non-null  object
 1   Sentiment  14691 non-null  object
dtypes: object(2)
memory usage: 229.7+ KB
None
Sentiment
positive    9121
neutral     3700
negative    1870
Name: count, dtype: int64


In [5]:
TEST_DATA_PATH = "dataset/processed/test.csv"
test_df = pd.read_csv(TEST_DATA_PATH)

print(test_df.head())
print(test_df.info())
print(test_df['Sentiment'].value_counts())

                                             Comment Sentiment
0  “oh my god guys there’s an octopus eating a cr...  negative
1  my daughter will be starting her 8th grade che...  positive
2  for some future video you should definitely bu...   neutral
3  i’m chronically ill and very frequently find i...  positive
4  the pizza planet pizza being awful is just dis...  negative
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3673 entries, 0 to 3672
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    3673 non-null   object
 1   Sentiment  3673 non-null   object
dtypes: object(2)
memory usage: 57.5+ KB
None
Sentiment
positive    2281
neutral      925
negative     467
Name: count, dtype: int64


In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')


stop_words = set(stopwords.words('english'))


keep_words = {
    "not", "no", "nor",
    "don't", "didn't", "doesn't",
    "isn't", "wasn't", "aren't", "weren't",
    "can't", "couldn't", "won't", "wouldn't",
    "shouldn't", "haven't", "hasn't", "hadn't"
}


stop_words = stop_words - keep_words


def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\d+", " ", text)
    # tokenize
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)


train_df['clean_comment'] = train_df['Comment'].astype(str).apply(clean_text)
test_df['clean_comment'] = test_df['Comment'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yuweihuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yuweihuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 切 train/val

In [7]:
TEXT_COL = "clean_comment"   
LABEL_COL = "Sentiment"

X = train_df[TEXT_COL]
y = train_df[LABEL_COL]

# label -> int
le = LabelEncoder()
y_int = le.fit_transform(y)

# 從 train_val 切 validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y_int,
    test_size=0.2,
    random_state=SEED,
    stratify=y_int
)

print("Train size:", len(X_train))
print("Val size:", len(X_val))


Train size: 11752
Val size: 2939


In [8]:
X_test = test_df[TEXT_COL]
y_test = le.transform(test_df[LABEL_COL])

print("Test size:", len(X_test))

# test 前五筆
preview = pd.DataFrame({
    "text": X_test.head().values,
    "label_int": y_test[:5],
    "label": le.inverse_transform(y_test[:5])
})
print(preview)


Test size: 3673
                                                text  label_int     label
0  “ oh god guys ’ octopus eating crab ” watches ...          0  negative
1  daughter starting th grade chem section next w...          2  positive
2  future video definitely build like huge base o...          1   neutral
3  ’ chronically ill frequently find difficult ea...          2  positive
4  pizza planet pizza awful disney sticking bit s...          0  negative


In [9]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# y_train label (0/1/2)
classes = np.unique(y_train)

class_weights_arr = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

class_weights = dict(zip(classes, class_weights_arr))
print("Class weights:", class_weights)


Class weights: {0: 2.6185383244206775, 1: 1.3234234234234235, 2: 0.5369152046783626}


# Tokenizer + padding

In [10]:
from collections import Counter

max_words = 20000   
max_len   = 150     

class SimpleTokenizer:
    def __init__(self, num_words, oov_token="<OOV>"):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts):
        counter = Counter()
        for text in texts:
            counter.update(text.split())
        vocab = counter.most_common(self.num_words - 1)  # reserve 0 for padding, 1 for OOV
        self.word_index = {self.oov_token: 1}
        idx = 2
        for word, _ in vocab:
            if idx >= self.num_words:
                break
            self.word_index[word] = idx
            idx += 1
        self.index_word = {idx: word for word, idx in self.word_index.items()}

    def texts_to_sequences(self, texts):
        seqs = []
        for text in texts:
            seq = []
            for word in text.split():
                idx = self.word_index.get(word)
                if idx is None or idx >= self.num_words:
                    idx = self.word_index[self.oov_token]
                seq.append(idx)
            seqs.append(seq)
        return seqs

def pad_sequences_custom(seqs, maxlen, padding='post', truncating='post'):
    padded = np.zeros((len(seqs), maxlen), dtype=np.int64)
    for i, seq in enumerate(seqs):
        if len(seq) > maxlen:
            trunc = seq[-maxlen:] if truncating == 'pre' else seq[:maxlen]
        else:
            trunc = seq
        if padding == 'pre':
            padded[i, -len(trunc):] = trunc
        else:
            padded[i, :len(trunc)] = trunc
    return padded

tokenizer = SimpleTokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

def texts_to_padded(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences_custom(seqs, maxlen=max_len, padding='post', truncating='post')

X_train_pad = texts_to_padded(X_train)
X_val_pad   = texts_to_padded(X_val)
X_test_pad  = texts_to_padded(X_test)



# LSTM

In [11]:
embedding_dim = 256
lstm_units = 128
dense_units = 64
num_classes = len(le.classes_)
vocab_size = max_words

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dense_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(hidden_dim * 2, dense_dim)
        self.fc2 = nn.Linear(dense_dim, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)
        h_cat = torch.cat((h[-2], h[-1]), dim=1)  # concat both directions
        x = torch.relu(self.fc1(h_cat))
        x = self.dropout(x)
        return self.fc2(x)

model = BiLSTMClassifier(vocab_size, embedding_dim, lstm_units, dense_units, num_classes).to(device)


In [12]:
summary(
    model,
    input_size=(1, max_len),      # batch, seq_len
    dtypes=[torch.long],
    col_names=("input_size", "output_size", "num_params"),
)


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
BiLSTMClassifier                         [1, 150]                  [1, 3]                    --
├─Embedding: 1-1                         [1, 150]                  [1, 150, 256]             5,120,000
├─LSTM: 1-2                              [1, 150, 256]             [1, 150, 256]             395,264
├─Linear: 1-3                            [1, 256]                  [1, 64]                   16,448
├─Dropout: 1-4                           [1, 64]                   [1, 64]                   --
├─Linear: 1-5                            [1, 64]                   [1, 3]                    195
Total params: 5,531,907
Trainable params: 5,531,907
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 64.43
Input size (MB): 0.00
Forward/backward pass size (MB): 0.61
Params size (MB): 22.13
Estimated Total Size (MB): 22.74

In [13]:
class_weights_tensor = torch.tensor(
    [class_weights.get(i, 1.0) for i in range(num_classes)],
    dtype=torch.float32,
    device=device
)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

batch_size = 128

train_ds = TensorDataset(
    torch.tensor(X_train_pad, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.long)
)
val_ds = TensorDataset(
    torch.tensor(X_val_pad, dtype=torch.long),
    torch.tensor(y_val, dtype=torch.long)
)
test_ds = TensorDataset(torch.tensor(X_test_pad, dtype=torch.long))

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=256)



In [14]:
num_epochs = 10
patience = 3
best_val = float("inf")
wait = 0

MODEL_PATH = "outputs/lstm/best_lstm.pt"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    running_train_loss = 0.0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        running_train_loss += loss.item() * xb.size(0)

    train_loss = running_train_loss / len(train_ds)

    model.eval()
    running_val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in val_dl:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = criterion(out, yb)
            running_val_loss += loss.item() * xb.size(0)
            pred = out.argmax(dim=1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)

    val_loss = running_val_loss / len(val_ds)
    val_acc = correct / total

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        wait = 0
        torch.save(model.state_dict(), MODEL_PATH)
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping")
            break


Epoch 1: train_loss=1.0450, val_loss=0.9788, val_acc=0.4750
Epoch 2: train_loss=0.8786, val_loss=0.8903, val_acc=0.6257
Epoch 3: train_loss=0.6367, val_loss=0.8711, val_acc=0.6444
Epoch 4: train_loss=0.3895, val_loss=1.0260, val_acc=0.6870
Epoch 5: train_loss=0.2074, val_loss=1.1934, val_acc=0.6689
Epoch 6: train_loss=0.1152, val_loss=1.5156, val_acc=0.6897
Early stopping


# test set 預測 + 存CSV

In [15]:
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(20000, 256, padding_idx=0)
  (lstm): LSTM(256, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=3, bias=True)
)

In [16]:
import numpy as np

probas = []
with torch.no_grad():
    for (xb,) in test_dl:
        xb = xb.to(device)
        out = model(xb)
        probas.append(torch.softmax(out, dim=1).cpu())

y_test_proba = torch.cat(probas, dim=0).numpy()
y_test_pred  = np.argmax(y_test_proba, axis=1)

In [17]:
pred_df = pd.DataFrame({
    "id": X_test.index,
    "comment": test_df.loc[X_test.index, "Comment"].values,
    "true_label": y_test.astype(int),
    "pred_label": y_test_pred.astype(int),
})

label_to_id = {cls: idx for idx, cls in enumerate(le.classes_)}
for idx, cls in enumerate(le.classes_):
    pred_df[f"prob_{label_to_id[cls]}"] = y_test_proba[:, idx]


In [18]:
OUTPUT_CSV = f"outputs/lstm/test_predictions_rs{SEED}.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
pred_df.to_csv(OUTPUT_CSV, index=False)
print("Saved to", OUTPUT_CSV)


Saved to outputs/lstm/test_predictions_rs42.csv


In [19]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix
)

# 1. Accuracy + classification report
acc = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {acc:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4, target_names=le.classes_))

# 2. One-vs-Rest AUC（正 vs 其他、中 vs 其他、負 vs 其他）
print("\nOne-vs-Rest AUC:")
for idx, cls_name in enumerate(le.classes_):
    # 這類當成 1，其它類當成 0
    y_true_bin = (y_test == idx).astype(int)
    auc = roc_auc_score(y_true_bin, y_test_proba[:, idx])
    print(f"AUC for {cls_name} ({idx}): {auc:.4f}")

# 3. Confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (rows = true, cols = predicted):")
print(cm)


Accuracy: 0.6493

Classification Report:
              precision    recall  f1-score   support

    negative     0.3175    0.5867    0.4120       467
     neutral     0.5145    0.5751    0.5431       925
    positive     0.8891    0.6922    0.7784      2281

    accuracy                         0.6493      3673
   macro avg     0.5737    0.6180    0.5779      3673
weighted avg     0.7221    0.6493    0.6726      3673


One-vs-Rest AUC:
AUC for negative (0): 0.7987
AUC for neutral (1): 0.7860
AUC for positive (2): 0.8498

Confusion Matrix (rows = true, cols = predicted):
[[ 274  123   70]
 [ 266  532  127]
 [ 323  379 1579]]
