In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from transformers import AutoModel
import random
from tqdm import tqdm
import re
import string
import nltk
# Fetch the NLTK "stopwords" corpus; the config cell reads its Indonesian word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# ------------------------------
# Config
# ------------------------------
# Checkpoints iterated by the training loop.
MODEL_NAMES = ["indobenchmark/indobert-large-p1"]
# Default checkpoint used by IndoBertClassifier when no name is passed.
MODEL_NAME = "indolem/indobertweet-base-uncased"

# Emotion name <-> class index lookups (index order follows the tuple below).
LABEL2INDEX = dict(zip(('love', 'anger', 'sadness', 'happy', 'fear'), range(5)))
INDEX2LABEL = dict(zip(LABEL2INDEX.values(), LABEL2INDEX.keys()))
NUM_LABELS = len(LABEL2INDEX)

# Tokenisation / optimisation hyper-parameters.
MAX_LEN, BATCH_SIZE, EPOCHS = 128, 64, 25
LR = 2e-5
AUG_FRAC = 0.2

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Indonesian stop words from NLTK plus two dataset-specific placeholder tokens.
indo_stop_words = [*stopwords.words(fileids='indonesian'), "[USERNAME]", "url"]

In [None]:


# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    """Dataset that tokenizes a single text on every item access.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of class indices, aligned with ``texts``.
        tokenizer: Callable tokenizer invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading axis of each encoded tensor
        # (presumably the size-1 batch axis produced by return_tensors="pt").
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item



In [None]:

class IndoBertClassifier(nn.Module):
    """Pretrained encoder followed by a max-pooled MLP head for single-label classification.

    The forward pass returns **raw logits** of shape ``(batch, num_labels)``.
    They are meant to be fed to ``nn.CrossEntropyLoss``, which applies
    log-softmax internally. Squashing them through a sigmoid first bounds the
    scores to [0, 1], which caps how confident the softmax can become and puts
    a floor of roughly ``log(1 + (num_labels - 1) / e)`` on the loss.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dense_1: Width of the first fully connected layer.
        dense_2: Width of the second fully connected layer.
        dropout: Dropout probability applied after the first dense layer.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name=MODEL_NAME,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Classification head: global max pool -> dense -> dropout -> dense -> output
        self.pool = nn.AdaptiveMaxPool1d(1)   # global max pooling over the sequence axis
        self.fc1 = nn.Linear(hidden_size, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()
        # Parameter-free; kept only so code that references the attribute keeps
        # working. It is intentionally NOT applied in forward().
        self.act_sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Compute class logits.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder; positions equal to 0
                are also excluded from the max pooling. May be ``None``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with unnormalised logits.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        if attention_mask is not None:
            # Padding positions still produce hidden states; push them to the
            # dtype minimum so they can never win the max pooling.
            is_padding = attention_mask.unsqueeze(-1) == 0
            embeddings = embeddings.masked_fill(
                is_padding, torch.finfo(embeddings.dtype).min
            )

        # PyTorch pooling works on (N, C, L), so permute first
        x = embeddings.permute(0, 2, 1)        # (batch, hidden, seq_len)
        x = self.pool(x).squeeze(-1)           # (batch, hidden)

        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        # Raw logits: no sigmoid/softmax here, the loss function handles it.
        return self.fc3(x)


In [None]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(model_output, labels)``.

    Returns:
        Tuple ``(mean per-batch loss, accuracy, weighted F1)`` over the epoch.
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []

    for step in dataloader:
        ids, mask, targets = (
            step[field].to(DEVICE)
            for field in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        scores = model(ids, mask)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # Predicted class = index of the highest score along the class axis.
        predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
        expected.extend(targets.cpu().numpy())

    epoch_acc = accuracy_score(expected, predicted)
    epoch_f1 = f1_score(expected, predicted, average="weighted")
    mean_loss = running_loss / len(dataloader)
    return mean_loss, epoch_acc, epoch_f1


In [None]:


def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.
        criterion: Loss callable invoked as ``criterion(model_output, labels)``.

    Returns:
        Tuple ``(mean per-batch loss, accuracy, weighted F1)``.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for step in dataloader:
            ids, mask, targets = (
                step[field].to(DEVICE)
                for field in ("input_ids", "attention_mask", "labels")
            )

            scores = model(ids, mask)
            running_loss += criterion(scores, targets).item()

            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    epoch_acc = accuracy_score(expected, predicted)
    epoch_f1 = f1_score(expected, predicted, average="weighted")
    mean_loss = running_loss / len(dataloader)
    return mean_loss, epoch_acc, epoch_f1


In [None]:
    # Read dataset (must have "tweet" and "label" columns)
    df = pd.read_csv("train.csv")

    # Fail early with a clear message instead of a KeyError further down.
    missing_columns = {"tweet", "label"} - set(df.columns)
    if missing_columns:
        raise ValueError(f"train.csv is missing required columns: {sorted(missing_columns)}")

    # Preview only the first rows rather than rendering the whole frame.
    df.head()

Unnamed: 0.1,Unnamed: 0,label,tweet
0,531,anger,"Palembang, bersikap baik, jangan main-main den..."
1,4145,sadness,kl gabisa ngurus anak gausah punya anak sih ya...
2,3815,fear,[USERNAME] [USERNAME] [USERNAME] Iya.. berbaga...
3,891,love,seluruh hidup saya. saya dedikasikan dan saya ...
4,485,fear,Ngasih tehaer ke keponakan. Baju dua biji cela...
...,...,...,...
8357,3950,sadness,doi liat tadinya mau di bersihin ama doi tapi ...
8358,2803,anger,"apa tdk ada kata lain, yg dipelajari anak2 dis..."
8359,2755,love,[USERNAME] Couple saya juga ga pernah manggil ...
8360,2815,love,Sampai salah satu pacar Anda pulang dan memint...


In [None]:

    # df['tweet'] = df['tweet'].apply(lambda x: clean_text(x))
    # df = augment_dataset(df, frac=AUG_FRAC)

    # Encode string labels as class indices. The dtype guard makes this cell
    # safe to re-run: mapping already-encoded integers through LABEL2INDEX
    # would otherwise turn every label into NaN.
    if not pd.api.types.is_integer_dtype(df["label"]):
        label_ids = df["label"].map(LABEL2INDEX)
        # Series.map yields NaN for values that are not keys of the dict.
        unknown_labels = sorted(df.loc[label_ids.isna(), "label"].astype(str).unique())
        if unknown_labels:
            raise ValueError(f"Labels missing from LABEL2INDEX: {unknown_labels}")
        df["label"] = label_ids.astype("int64")

    texts = df["tweet"].tolist()
    labels = df["label"].tolist()



In [None]:

    # Fixed seed so every checkpoint is trained/validated on the same 80/20 split
    # and the split is reproducible across kernel restarts.
    SPLIT_SEED = 42

    for i, model_name in enumerate(MODEL_NAMES):
        print(f"Training {model_name}")

        # Tokenizer and encoder must come from the same checkpoint, and it must
        # be the one this iteration is supposed to train.
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        dataset = TweetDataset(texts, labels, tokenizer)

        # Split into train/val (80/20)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(
            dataset,
            [train_size, val_size],
            generator=torch.Generator().manual_seed(SPLIT_SEED),
        )
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

        # Model, Loss, Optimizer
        model = IndoBertClassifier(model_name=model_name, num_labels=NUM_LABELS).to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

        # Training Loop
        for epoch in range(EPOCHS):
            train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
            val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

            print(f"Epoch {epoch+1}/{EPOCHS}")
            print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
            print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

        # Save Model (weights after the final epoch)
        os.makedirs("saved_model", exist_ok=True)
        torch.save(model.state_dict(), f"saved_model/model_{i}.pt")
        print(f"Model saved to saved_model/model_{i}.pt")
        print("---------------------------------------------------------------------------")


Epoch 1/25
  Train Loss: 1.5672 | Acc: 0.3649 | F1: 0.2977
  Val   Loss: 1.4971 | Acc: 0.5824 | F1: 0.5207
Epoch 2/25
  Train Loss: 1.4285 | Acc: 0.6488 | F1: 0.6405
  Val   Loss: 1.3775 | Acc: 0.7209 | F1: 0.7165
Epoch 3/25
  Train Loss: 1.3063 | Acc: 0.7550 | F1: 0.7534
  Val   Loss: 1.3034 | Acc: 0.7437 | F1: 0.7394
Epoch 4/25
  Train Loss: 1.1944 | Acc: 0.8276 | F1: 0.8258
  Val   Loss: 1.2490 | Acc: 0.7585 | F1: 0.7564
Epoch 5/25
  Train Loss: 1.1125 | Acc: 0.8776 | F1: 0.8767
  Val   Loss: 1.2096 | Acc: 0.7659 | F1: 0.7665
Epoch 6/25
  Train Loss: 1.0537 | Acc: 0.9090 | F1: 0.9084
  Val   Loss: 1.1902 | Acc: 0.7702 | F1: 0.7698
Epoch 7/25
  Train Loss: 1.0142 | Acc: 0.9330 | F1: 0.9327
  Val   Loss: 1.1812 | Acc: 0.7665 | F1: 0.7689
Epoch 8/25
  Train Loss: 0.9918 | Acc: 0.9430 | F1: 0.9429
  Val   Loss: 1.1686 | Acc: 0.7690 | F1: 0.7671
Epoch 9/25
  Train Loss: 0.9804 | Acc: 0.9462 | F1: 0.9460
  Val   Loss: 1.1632 | Acc: 0.7708 | F1: 0.7714
Epoch 10/25
  Train Loss: 0.9665 | Ac

In [None]:
# "indobenchmark/indobert-base-p1"
# Epoch 1/25
#   Train Loss: 1.5805 | Acc: 0.3224 | F1: 0.2720
#   Val   Loss: 1.5250 | Acc: 0.5690 | F1: 0.5375
# Epoch 2/25
#   Train Loss: 1.4280 | Acc: 0.6478 | F1: 0.6362
#   Val   Loss: 1.3709 | Acc: 0.6707 | F1: 0.6737
# Epoch 3/25
#   Train Loss: 1.2843 | Acc: 0.7693 | F1: 0.7678
#   Val   Loss: 1.2917 | Acc: 0.7101 | F1: 0.7099
# Epoch 4/25
#   Train Loss: 1.1860 | Acc: 0.8438 | F1: 0.8432
#   Val   Loss: 1.2559 | Acc: 0.7065 | F1: 0.7092
# Epoch 5/25
#   Train Loss: 1.1079 | Acc: 0.9023 | F1: 0.9020
#   Val   Loss: 1.2220 | Acc: 0.7256 | F1: 0.7272
# Epoch 6/25
#   Train Loss: 1.0565 | Acc: 0.9318 | F1: 0.9315
#   Val   Loss: 1.2135 | Acc: 0.7185 | F1: 0.7213
# Epoch 7/25
#   Train Loss: 1.0246 | Acc: 0.9459 | F1: 0.9458
#   Val   Loss: 1.1831 | Acc: 0.7394 | F1: 0.7418
# Epoch 8/25
#   Train Loss: 1.0016 | Acc: 0.9595 | F1: 0.9594
#   Val   Loss: 1.2038 | Acc: 0.7095 | F1: 0.7135
# Epoch 9/25
#   Train Loss: 0.9869 | Acc: 0.9664 | F1: 0.9663
#   Val   Loss: 1.1874 | Acc: 0.7203 | F1: 0.7236
# Epoch 10/25
#   Train Loss: 0.9758 | Acc: 0.9730 | F1: 0.9730
#   Val   Loss: 1.1767 | Acc: 0.7256 | F1: 0.7260
# Epoch 11/25
#   Train Loss: 0.9688 | Acc: 0.9750 | F1: 0.9750
#   Val   Loss: 1.1786 | Acc: 0.7298 | F1: 0.7312
# Epoch 12/25
#   Train Loss: 0.9621 | Acc: 0.9803 | F1: 0.9802
#   Val   Loss: 1.1700 | Acc: 0.7322 | F1: 0.7300
# Epoch 13/25
#   Train Loss: 0.9566 | Acc: 0.9846 | F1: 0.9845
#   Val   Loss: 1.1704 | Acc: 0.7322 | F1: 0.7355
# Epoch 14/25
#   Train Loss: 0.9540 | Acc: 0.9861 | F1: 0.9861
#   Val   Loss: 1.1655 | Acc: 0.7340 | F1: 0.7350
# Epoch 15/25
#   Train Loss: 0.9523 | Acc: 0.9867 | F1: 0.9867
#   Val   Loss: 1.1690 | Acc: 0.7256 | F1: 0.7276
# Epoch 16/25
#   Train Loss: 0.9484 | Acc: 0.9901 | F1: 0.9901
#   Val   Loss: 1.1819 | Acc: 0.7161 | F1: 0.7180
# Epoch 17/25
#   Train Loss: 0.9468 | Acc: 0.9907 | F1: 0.9907
#   Val   Loss: 1.1730 | Acc: 0.7274 | F1: 0.7276
# Epoch 18/25
#   Train Loss: 0.9440 | Acc: 0.9934 | F1: 0.9933
#   Val   Loss: 1.1763 | Acc: 0.7256 | F1: 0.7293
# Epoch 19/25
#   Train Loss: 0.9414 | Acc: 0.9950 | F1: 0.9950
#   Val   Loss: 1.1653 | Acc: 0.7322 | F1: 0.7333
# Epoch 20/25
#   Train Loss: 0.9400 | Acc: 0.9967 | F1: 0.9967
#   Val   Loss: 1.1744 | Acc: 0.7250 | F1: 0.7260
# Epoch 21/25
#   Train Loss: 0.9409 | Acc: 0.9946 | F1: 0.9946
#   Val   Loss: 1.1742 | Acc: 0.7221 | F1: 0.7271
# Epoch 22/25
#   Train Loss: 0.9389 | Acc: 0.9969 | F1: 0.9969
#   Val   Loss: 1.1702 | Acc: 0.7292 | F1: 0.7318
# Epoch 23/25
#   Train Loss: 0.9409 | Acc: 0.9943 | F1: 0.9942
#   Val   Loss: 1.1796 | Acc: 0.7215 | F1: 0.7254
# Epoch 24/25
#   Train Loss: 0.9388 | Acc: 0.9966 | F1: 0.9966
#   Val   Loss: 1.1661 | Acc: 0.7340 | F1: 0.7339
# Epoch 25/25
#   Train Loss: 0.9385 | Acc: 0.9966 | F1: 0.9966
#   Val   Loss: 1.1730 | Acc: 0.7274 | F1: 0.7306
# Model saved to saved_model/model_0.pt
# ---------------------------------------------------------------------------

In [None]:
# "indobenchmark/indobert-large-p1"
# Epoch 1/25
#   Train Loss: 1.5672 | Acc: 0.3649 | F1: 0.2977
#   Val   Loss: 1.4971 | Acc: 0.5824 | F1: 0.5207
# Epoch 2/25
#   Train Loss: 1.4285 | Acc: 0.6488 | F1: 0.6405
#   Val   Loss: 1.3775 | Acc: 0.7209 | F1: 0.7165
# Epoch 3/25
#   Train Loss: 1.3063 | Acc: 0.7550 | F1: 0.7534
#   Val   Loss: 1.3034 | Acc: 0.7437 | F1: 0.7394
# Epoch 4/25
#   Train Loss: 1.1944 | Acc: 0.8276 | F1: 0.8258
#   Val   Loss: 1.2490 | Acc: 0.7585 | F1: 0.7564
# Epoch 5/25
#   Train Loss: 1.1125 | Acc: 0.8776 | F1: 0.8767
#   Val   Loss: 1.2096 | Acc: 0.7659 | F1: 0.7665
# Epoch 6/25
#   Train Loss: 1.0537 | Acc: 0.9090 | F1: 0.9084
#   Val   Loss: 1.1902 | Acc: 0.7702 | F1: 0.7698
# Epoch 7/25
#   Train Loss: 1.0142 | Acc: 0.9330 | F1: 0.9327
#   Val   Loss: 1.1812 | Acc: 0.7665 | F1: 0.7689
# Epoch 8/25
#   Train Loss: 0.9918 | Acc: 0.9430 | F1: 0.9429
#   Val   Loss: 1.1686 | Acc: 0.7690 | F1: 0.7671
# Epoch 9/25
#   Train Loss: 0.9804 | Acc: 0.9462 | F1: 0.9460
#   Val   Loss: 1.1632 | Acc: 0.7708 | F1: 0.7714
# Epoch 10/25
#   Train Loss: 0.9665 | Acc: 0.9550 | F1: 0.9549
#   Val   Loss: 1.1691 | Acc: 0.7566 | F1: 0.7574
# Epoch 11/25
#   Train Loss: 0.9639 | Acc: 0.9534 | F1: 0.9533
#   Val   Loss: 1.1488 | Acc: 0.7763 | F1: 0.7760
# Epoch 12/25
#   Train Loss: 0.9564 | Acc: 0.9589 | F1: 0.9588
#   Val   Loss: 1.1536 | Acc: 0.7690 | F1: 0.7653
# Epoch 13/25
#   Train Loss: 0.9503 | Acc: 0.9634 | F1: 0.9633
#   Val   Loss: 1.1598 | Acc: 0.7640 | F1: 0.7603
# Epoch 14/25
#   Train Loss: 0.9474 | Acc: 0.9643 | F1: 0.9642
#   Val   Loss: 1.1493 | Acc: 0.7628 | F1: 0.7656
# Epoch 15/25
#   Train Loss: 0.9444 | Acc: 0.9662 | F1: 0.9662
#   Val   Loss: 1.1430 | Acc: 0.7776 | F1: 0.7788
# Epoch 16/25
#   Train Loss: 0.9433 | Acc: 0.9664 | F1: 0.9663
#   Val   Loss: 1.1456 | Acc: 0.7727 | F1: 0.7717
# Epoch 17/25
#   Train Loss: 0.9410 | Acc: 0.9682 | F1: 0.9681
#   Val   Loss: 1.1586 | Acc: 0.7585 | F1: 0.7602
# Epoch 18/25
#   Train Loss: 0.9421 | Acc: 0.9661 | F1: 0.9660
#   Val   Loss: 1.1536 | Acc: 0.7628 | F1: 0.7597
# Epoch 19/25
#   Train Loss: 0.9437 | Acc: 0.9632 | F1: 0.9632
#   Val   Loss: 1.1445 | Acc: 0.7702 | F1: 0.7735
# Epoch 20/25
#   Train Loss: 0.9384 | Acc: 0.9694 | F1: 0.9693
#   Val   Loss: 1.1390 | Acc: 0.7770 | F1: 0.7760
# Epoch 21/25
#   Train Loss: 0.9367 | Acc: 0.9702 | F1: 0.9702
#   Val   Loss: 1.1366 | Acc: 0.7757 | F1: 0.7752