In [47]:
from tkinter.tix import COLUMN

import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from transformers import AutoModel
import numpy as np
from sklearn.model_selection import train_test_split
from research.simple_aug import augment_dataset
from research.utils import clean_text, remove_stopwords
from typing import List, Tuple


In [48]:

# ------------------------------
# Config
# ------------------------------
# Pretrained checkpoint and input data location.
MODEL_NAME = "indolem/indobertweet-base-uncased"
DATAFRAME_PATH = "/resources/data/all_bt_data/all_back_translation.csv"

# Column layout: TWEETS_COLUMNS[0] is the text column used for training and
# validation; adjust_data() reads [1] for the "1pBT" style and [2] for "2pBT".
TWEETS_COLUMNS = ["tweet", "1p_bt_tweet", "2p_bt_tweet"]
TARGET_COLUMN = "label"
AUG_STYLES = ["natural", "augment", "1pBT", "2pBT"]  # one training run per style

# Emotion label <-> class index lookup tables.
LABEL2INDEX = dict(love=0, anger=1, sadness=2, happy=3, fear=4)
INDEX2LABEL = dict((index, label) for label, index in LABEL2INDEX.items())
NUM_LABELS = len(LABEL2INDEX)

# Tokenization / optimization hyper-parameters.
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 25
LR = 2e-5
AUG_FRAC = 0.2  # NOTE(review): not referenced by any visible cell — confirm it is still needed
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed both RNGs once so a fresh Restart & Run All is repeatable.
np.random.seed(42)
torch.manual_seed(42)

In [49]:


# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes a single tweet each time it is indexed.

    Args:
        texts: Indexable collection of tweet texts.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # Dataset size is driven by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples into a batch itself.
        sample = {
            key: tokenized[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample



In [50]:

class IndoBertClassifier(nn.Module):
    """Pretrained transformer encoder followed by a max-pooled dense head.

    The encoder's per-token hidden states are max-pooled over the sequence
    axis and passed through ``fc1 -> ReLU -> dropout -> fc2 -> ReLU -> fc3``.

    ``forward`` returns raw, unnormalized class scores (logits) of shape
    ``(batch, num_labels)``. The training cells use ``nn.CrossEntropyLoss``,
    which applies log-softmax itself and expects unnormalized logits; squashing
    the scores through a sigmoid first confines them to ``[0, 1]`` and distorts
    the loss. Sigmoid is monotonic, so ``argmax`` predictions are the same with
    or without it. Use ``torch.softmax(logits, dim=1)`` when probabilities are
    needed.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dense_1: Width of the first dense layer.
        dense_2: Width of the second dense layer.
        dropout: Dropout probability applied after the first dense layer.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name=MODEL_NAME,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Classification head
        self.pool = nn.AdaptiveMaxPool1d(1)  # global max over the sequence axis
        self.fc1 = nn.Linear(hidden_size, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, num_labels)``."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        # PyTorch pooling works on (N, C, L), so permute first
        x = embeddings.permute(0, 2, 1)  # (batch, hidden, seq_len)
        # NOTE(review): the max runs over every sequence position; the
        # attention mask is not applied here, so padded positions take part.
        x = self.pool(x).squeeze(-1)  # (batch, hidden)

        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        # Raw logits: the loss function performs the normalization.
        return self.fc3(x)


In [51]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    """Run one optimization pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, accuracy, weighted_f1)`` for the epoch.
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []

    for batch in dataloader:
        # Move the three tensors of the batch onto the training device.
        ids, mask, targets = (
            batch[key].to(DEVICE)
            for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        scores = model(ids, mask)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # The predicted class is the highest-scoring column of each row.
        predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
        expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    return running_loss / len(dataloader), accuracy, weighted_f1


In [52]:
def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, accuracy, weighted_f1)``.
    """
    model.eval()
    loss_sum = 0
    y_pred, y_true = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            targets = batch["labels"].to(DEVICE)

            scores = model(token_ids, mask)
            loss_sum += criterion(scores, targets).item()

            batch_pred = torch.argmax(scores, dim=1).cpu().numpy()
            y_pred.extend(batch_pred)
            y_true.extend(targets.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return loss_sum / len(dataloader), accuracy, weighted_f1


In [53]:
from typing import List, Tuple
import pandas as pd

def adjust_data(aug_style: str, df: pd.DataFrame) -> Tuple[List[str], List[str], List[int], List[int]]:
    """Shuffle ``df``, hold out 25% for validation and build one training set.

    The validation split always uses the first text column
    (``TWEETS_COLUMNS[0]``). Augmentation is applied to the training rows
    only, so no augmented variant of a validation tweet enters training.

    Args:
        aug_style: One of
            ``"natural"`` – training rows unchanged;
            ``"augment"`` – training rows passed through ``augment_dataset``;
            ``"1pBT"`` / ``"2pBT"`` – training tweets followed by the
            ``TWEETS_COLUMNS[1]`` / ``TWEETS_COLUMNS[2]`` variant of the same
            rows, with the labels repeated to match.
        df: Frame containing the ``TWEETS_COLUMNS`` and ``TARGET_COLUMN`` columns.

    Returns:
        ``(train_tweets, val_tweets, train_labels, val_labels)``.

    Raises:
        ValueError: If ``aug_style`` is not one of the styles listed above.
    """
    text_col = TWEETS_COLUMNS[0]
    # Back-translation style -> position of its column in TWEETS_COLUMNS.
    bt_column_index = {"1pBT": 1, "2pBT": 2}
    known_styles = ("natural", "augment", *bt_column_index)
    if aug_style not in known_styles:
        # A typo used to fall through silently to the un-augmented data.
        raise ValueError(
            f"Unknown aug_style {aug_style!r}; expected one of {known_styles}"
        )

    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # fixed-seed shuffle
    val_size = int(len(df) * 0.25)

    df_val = df.iloc[:val_size]
    df_train = df.iloc[val_size:]

    val_tweets = df_val[text_col].tolist()
    val_labels = df_val[TARGET_COLUMN].tolist()

    tweets = df_train[text_col].tolist()
    labels = df_train[TARGET_COLUMN].tolist()

    if aug_style == "augment":
        # The helper's result must expose the same text and label columns.
        augmented = augment_dataset(df_train, text_col=text_col)
        tweets = augmented[text_col].tolist()
        labels = augmented[TARGET_COLUMN].tolist()
    elif aug_style in bt_column_index:
        bt_col = TWEETS_COLUMNS[bt_column_index[aug_style]]
        # Originals first, then their back-translations; labels repeat in step.
        tweets = tweets + df_train[bt_col].tolist()
        labels = labels + labels

    # Order: train tweets, val tweets, train labels, val labels
    return tweets, val_tweets, labels, val_labels


In [54]:
def save_to_df(tweets, labels, file_name):
    """Write ``tweets`` and ``labels`` to ``file_name`` as a two-column CSV.

    The columns are named after ``TWEETS_COLUMNS[0]`` and ``TARGET_COLUMN``;
    the frame index is not written.
    """
    columns = {TWEETS_COLUMNS[0]: tweets, TARGET_COLUMN: labels}
    pd.DataFrame(columns).to_csv(f'{file_name}', index=False)

In [None]:
    # Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Read the dataset; later cells use the TWEETS_COLUMNS text columns and the TARGET_COLUMN label column
df = pd.read_csv(DATAFRAME_PATH)
df

In [55]:
# Encode string labels as class indices. The guard keeps this cell safe to
# re-run: mapping already-encoded integers through LABEL2INDEX would turn
# every label into NaN.
if not pd.api.types.is_integer_dtype(df[TARGET_COLUMN]):
    encoded_labels = df[TARGET_COLUMN].map(LABEL2INDEX)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        # Fail here instead of carrying NaN labels into training.
        unknown = sorted(df.loc[unmapped, TARGET_COLUMN].astype(str).unique())
        raise ValueError(f"Labels missing from LABEL2INDEX: {unknown}")
    df[TARGET_COLUMN] = encoded_labels.astype("int64")

# Clean every text column (original and back-translated) the same way.
for tweet_column in TWEETS_COLUMNS:
    df[tweet_column] = df[tweet_column].apply(clean_text).apply(remove_stopwords)
df


Unnamed: 0,label,tweet,1p_bt_tweet,2p_bt_tweet
0,1,jaitiru polisi menangani gubernur emangany pol...,jln jatibarupolisi tdk bs gertak gubernur eman...,jatiro polisi berurusan gubernur amangani poli...
1,1,kau gadisgadis sibuk sibuk sakit menstruasi pa...,cewe lho kayaknya rasain sibuk jaga rasain sak...,kau gadisgadis sibuk sibuk sibuk merasakan sak...
2,3,amad photo google sengaja temanteman membayang...,kepingin gudeg mbarek bu hj amad foto google s...,amad photo google sengaja temanteman membayang...
3,1,jataru street wilayah mr brother jwb jwb penga...,jln jatibarubagian wilayah tn abangpengaturan ...,jalan jatararu greatment mr brother the jwb jw...
4,3,berbagi pengalaman kemarin sore membatalkan ti...,sharing pengalaman aja kemarin jam batalin tik...,berbagi pengalaman kemarin pm membatalkan tike...
...,...,...,...,...
4396,0,kau menutup matanya menahan gejolaknya jiwanya...,tahukah papa memejamkan matanya menahan gejola...,memejamkan mata menghentikan rewel jiwanya ben...
4397,4,kesulitan menetapkan pemilihan wakil presiden ...,sulitnya menetapkan calon wapresnya jokowi pil...,kesulitan menetapkan pemilihan wapres jacobi s...
4398,1,depannya,depannya nggak lha iya gimana coba lulusan sen...,depannya
4399,3,mahasiswa ui techniques menembak pacarnya pena...,beneran mahasiswa teknik ui nembak pacarnya pa...,mahasiswa teknologi ui menembak pacarnya kuku ...


In [59]:
# Train one model per augmentation style and keep the weights of the epoch
# with the best validation F1.
for aug_style in AUG_STYLES:
    X_train, X_test, y_train, y_test = adjust_data(df=df, aug_style=aug_style)
    train_dataset = TweetDataset(X_train, y_train, tokenizer)
    val_dataset = TweetDataset(X_test, y_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Model, Loss, Optimizer
    model = IndoBertClassifier().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    f1_best = 0
    best_model = None

    # Training Loop
    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
        print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")
        if val_f1 > f1_best:
            f1_best = val_f1
            # state_dict() returns references to the live parameters, so take
            # a detached CPU copy; otherwise later epochs overwrite the
            # "best" weights in place.
            best_model = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            if val_f1 > 0.8:
                # NOTE(review): every style writes to the same 'train'/'test'
                # files, so a later style overwrites an earlier one — confirm
                # this is intended.
                save_to_df(X_train, y_train, file_name='train')
                save_to_df(X_test, y_test, file_name='test')

    # Save Model: the best-F1 snapshot, or the final weights if no epoch
    # ever improved on the initial f1_best.
    os.makedirs("saved_model", exist_ok=True)
    weights_to_save = best_model if best_model is not None else model.state_dict()
    torch.save(weights_to_save, f"saved_model/indobert_tweet_classifier_{aug_style}.pt")
    print(f"Model saved to saved_model/indobert_tweet_classifier_{aug_style}.pt")


330it [00:00, 17949.71it/s]
