In [13]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from transformers import AutoModel
import numpy as np
from sklearn.model_selection import train_test_split
from typing import List, Tuple
import numpy as np


In [14]:

# ------------------------------
# Config
# ------------------------------
MODELS_NAME = {"indoroberta": "flax-community/indonesian-roberta-base"}
DATAFRAME_PATH = "all_back_translation.csv"
TWEETS_COLUMNS = ["tweet", "1p_bt_tweet", "2p_bt_tweet"]
AUG_STYLES = ["1pBT"]  # "augment",  "2pBT"
TARGET_COLUMN = "label"
LABEL2INDEX = {'love': 0, 'anger': 1, 'sadness': 2, 'happy': 3, 'fear': 4}
INDEX2LABEL = {v: k for k, v in LABEL2INDEX.items()}
NUM_LABELS = len(LABEL2INDEX)
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 25
LR = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VAL_SIZE = int(4400*0.05)
torch.manual_seed(0)
np.random.seed(0)


In [15]:


# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }



In [16]:

class IndoBertClassifier(nn.Module):
    def __init__(self, model_name,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super(IndoBertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Layers according to your Keras architecture
        self.pool = nn.AdaptiveMaxPool1d(1)  # GlobalMaxPool1D equivalent
        self.fc1 = nn.Linear(hidden_size, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()
        self.act_sigmoid = nn.Sigmoid()  # multi-label case

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        # PyTorch pooling works on (N, C, L), so permute first
        x = embeddings.permute(0, 2, 1)  # (batch, hidden, seq_len)
        x = self.pool(x).squeeze(-1)  # (batch, hidden)

        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        logits = self.fc3(x)
        out = self.act_sigmoid(logits)
        return out


In [17]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [18]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [19]:
from typing import List, Tuple
import pandas as pd

def adjust_data(aug_style: str, df: pd.DataFrame) -> Tuple[List[str], List[int], List[str], List[int]]:

    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle first

    val_size = VAL_SIZE
    df_val = df.iloc[:val_size]
    df_train = df.iloc[val_size:]

    val_tweets = df_val[TWEETS_COLUMNS[0]].tolist()
    val_labels = df_val[TARGET_COLUMN].tolist()

    tweets = df_train[TWEETS_COLUMNS[0]].tolist()
    labels = df_train[TARGET_COLUMN].tolist()

    if aug_style == "augment":
        nat_augment_dataset = augment_dataset(df_train, text_col=TWEETS_COLUMNS[0])
        tweets = tweets + nat_augment_dataset[TWEETS_COLUMNS[0]].tolist()
        labels = labels + nat_augment_dataset[TARGET_COLUMN].tolist()

    elif aug_style == "1pBT":
        origin_tweets = df_train[TWEETS_COLUMNS[0]].tolist()
        one_phase_BT_tweets = df_train[TWEETS_COLUMNS[1]].tolist()
        tweets = origin_tweets + one_phase_BT_tweets #+ df_val[TWEETS_COLUMNS[1]].tolist()

        labels = df_train[TARGET_COLUMN].tolist()
        labels = labels + labels #+ df_val[TARGET_COLUMN].tolist() # duplicate labels

    elif aug_style == "2pBT":
        origin_tweets = df_train[TWEETS_COLUMNS[0]].tolist()
        two_phase_BT_tweets = df_train[TWEETS_COLUMNS[2]].tolist()
        tweets = origin_tweets + two_phase_BT_tweets #+ df_val[TWEETS_COLUMNS[2]].tolist()


        labels = df_train[TARGET_COLUMN].tolist()
        labels = labels + labels #+ df_val[TARGET_COLUMN].tolist()

    return tweets, val_tweets, labels, val_labels


In [20]:
import random

import pandas as pd
from tqdm import tqdm


def random_deletion(words, p=0.2):
    """ Randomly delete words with probability p """
    if len(words) == 1:
        return words
    return [w for w in words if random.uniform(0, 1) > p]


def random_swap(words, n=1):
    """ Swap two words n times """
    words = words.copy()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return words


def random_insertion(words, n=1):
    """ Insert a random word from the sentence into a random position """
    words = words.copy()
    for _ in range(n):
        new_word = random.choice(words)
        insert_pos = random.randint(0, len(words))
        words.insert(insert_pos, new_word)
    return words


def augment_text(text):
    words = text.split()
    if len(words) == 1:
        return text
    choice = random.choice(['delete', 'swap', 'insert'])

    if choice == 'delete':
        aug_words = random_deletion(words)
    elif choice == 'swap':
        aug_words = random_swap(words)
    elif choice == 'insert':
        aug_words = random_insertion(words)

    return " ".join(aug_words)

def augment_dataset(df:pd.DataFrame , text_col="tweet", label_col="label", frac=0.49):
    """
    Augment a fraction of dataset rows and return a bigger dataset
    """
    # sample rows for augmentation
    sampled = df.sample(frac=frac, random_state=42)

    augmented_texts = []
    augmented_labels = []

    for _, row in tqdm(sampled.iterrows()):
        aug_text = augment_text(row[text_col])  # from previous code
        augmented_texts.append(aug_text)
        augmented_labels.append(row[label_col])

    # make augmented dataframe
    df_aug = pd.DataFrame({text_col: augmented_texts, label_col: augmented_labels})

    # concatenate with original
    df_new = pd.concat([df, df_aug], ignore_index=True)
    return df_new


In [21]:
def save_to_df(tweets, labels, file_name ):
    df_final = pd.DataFrame({
    TWEETS_COLUMNS[0]: tweets,
    TARGET_COLUMN: labels})
    df_final.to_csv(f'{file_name}', index=False)

In [22]:

# Read dataset (must have "text" and "label" columns)
df = pd.read_csv(DATAFRAME_PATH)
df

Unnamed: 0.1,Unnamed: 0,label,tweet,2p_bt_tweet,1p_bt_tweet
0,0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...","Sedangkan Jatiro, polisi tidak bisa berurusan ...","Adapun Jaitiru, polisi tidak bisa menangani gu..."
1,1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...","Kau tahu, itu seharusnya gadis-gadis yang haru...","Kau tahu, itu seharusnya gadis-gadis yang haru..."
2,2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,"Amad Photo dari Google, sengaja, sehingga tema...","Amad Photo dari Google, sengaja, sehingga tema..."
3,3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...","Jalan Jatararu, bagian dari Greatment Mr Broth...","Jataru Street, bagian dari wilayah Mr. Brother..."
4,4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...","Berbagi pengalaman, kemarin pukul 6: 00 pm, me...","Berbagi pengalaman, kemarin pukul 6.00 sore, m..."
...,...,...,...,...,...
4396,4396,love,"Tahukah kamu, bahwa saat itu papa memejamkan m...",Apakah Anda tahu bahwa pada saat itu aku memej...,Apakah kau tahu bahwa pada saat itu aku menutu...
4397,4397,fear,Sulitnya menetapkan Calon Wapresnya Jokowi di ...,Kesulitan untuk menetapkan pemilihan Wapres Ja...,Kesulitan dalam menetapkan pemilihan Wakil Pre...
4398,4398,anger,"5. masa depannya nggak jelas. lha iya, gimana ...",Lima... Masa depannya tidak pasti.,"5 tahun, masa depannya tidak jelas."
4399,4399,happy,[USERNAME] dulu beneran ada mahasiswa Teknik U...,Sebenarnya ada seorang mahasiswa teknologi UI ...,Sebenarnya ada seorang mahasiswa UI Techniques...


In [23]:
df[TARGET_COLUMN] = df[TARGET_COLUMN].map(LABEL2INDEX)
df


Unnamed: 0.1,Unnamed: 0,label,tweet,2p_bt_tweet,1p_bt_tweet
0,0,1,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...","Sedangkan Jatiro, polisi tidak bisa berurusan ...","Adapun Jaitiru, polisi tidak bisa menangani gu..."
1,1,1,"Sesama cewe lho (kayaknya), harusnya bisa lebi...","Kau tahu, itu seharusnya gadis-gadis yang haru...","Kau tahu, itu seharusnya gadis-gadis yang haru..."
2,2,3,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,"Amad Photo dari Google, sengaja, sehingga tema...","Amad Photo dari Google, sengaja, sehingga tema..."
3,3,1,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...","Jalan Jatararu, bagian dari Greatment Mr Broth...","Jataru Street, bagian dari wilayah Mr. Brother..."
4,4,3,"Sharing pengalaman aja, kemarin jam 18.00 bata...","Berbagi pengalaman, kemarin pukul 6: 00 pm, me...","Berbagi pengalaman, kemarin pukul 6.00 sore, m..."
...,...,...,...,...,...
4396,4396,0,"Tahukah kamu, bahwa saat itu papa memejamkan m...",Apakah Anda tahu bahwa pada saat itu aku memej...,Apakah kau tahu bahwa pada saat itu aku menutu...
4397,4397,4,Sulitnya menetapkan Calon Wapresnya Jokowi di ...,Kesulitan untuk menetapkan pemilihan Wapres Ja...,Kesulitan dalam menetapkan pemilihan Wakil Pre...
4398,4398,1,"5. masa depannya nggak jelas. lha iya, gimana ...",Lima... Masa depannya tidak pasti.,"5 tahun, masa depannya tidak jelas."
4399,4399,3,[USERNAME] dulu beneran ada mahasiswa Teknik U...,Sebenarnya ada seorang mahasiswa teknologi UI ...,Sebenarnya ada seorang mahasiswa UI Techniques...


In [24]:
for model_name, model_path in MODELS_NAME.items():
  for aug_style in AUG_STYLES:
        # Load tokenizer
      tokenizer = AutoTokenizer.from_pretrained(model_path)
      X_train, X_test, y_train, y_test = adjust_data(df=df, aug_style=aug_style)
      train_dataset = TweetDataset(X_train, y_train, tokenizer)
      val_dataset = TweetDataset(X_test, y_test, tokenizer)
      train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
      val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
      # Model, Loss, Optimizer
      model = IndoBertClassifier(model_name=model_path).to(DEVICE)
      criterion = nn.CrossEntropyLoss()
      optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
      f1_best = 0
      best_model = None
      # Training Loop
      for epoch in range(EPOCHS):
          train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
          val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

          print(f"Epoch {epoch + 1}/{EPOCHS}")
          print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
          print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")
          if val_f1 > f1_best:
              f1_best = val_f1
              best_model = model.state_dict()
              if val_f1 > 0.8:
                  save_to_df(X_train, y_train, file_name='train')
                  save_to_df(X_test, y_test, file_name='test')

      # Save Model
      os.makedirs("saved_model", exist_ok=True)
      torch.save(best_model, f"saved_model/{model_name}_{aug_style}.pt")
      print(f"Model saved to saved_model/{model_name}_{aug_style}.pt")


Some weights of RobertaModel were not initialized from the model checkpoint at flax-community/indonesian-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/25
  Train Loss: 1.5207 | Acc: 0.3874 | F1: 0.3345
  Val   Loss: 1.4135 | Acc: 0.6364 | F1: 0.6221
Epoch 2/25
  Train Loss: 1.3596 | Acc: 0.6203 | F1: 0.6017
  Val   Loss: 1.3067 | Acc: 0.6909 | F1: 0.6671
Epoch 3/25
  Train Loss: 1.2620 | Acc: 0.6945 | F1: 0.6740
  Val   Loss: 1.2587 | Acc: 0.7091 | F1: 0.6853
Epoch 4/25
  Train Loss: 1.1928 | Acc: 0.7569 | F1: 0.7448
  Val   Loss: 1.2099 | Acc: 0.7727 | F1: 0.7612
Epoch 5/25
  Train Loss: 1.1235 | Acc: 0.8176 | F1: 0.8151
  Val   Loss: 1.1942 | Acc: 0.7500 | F1: 0.7531
Epoch 6/25
  Train Loss: 1.0702 | Acc: 0.8584 | F1: 0.8578
  Val   Loss: 1.1689 | Acc: 0.7727 | F1: 0.7762
Epoch 7/25
  Train Loss: 1.0359 | Acc: 0.8857 | F1: 0.8854
  Val   Loss: 1.1371 | Acc: 0.7727 | F1: 0.7726
Epoch 8/25
  Train Loss: 1.0141 | Acc: 0.9004 | F1: 0.9000
  Val   Loss: 1.1146 | Acc: 0.8227 | F1: 0.8227
Epoch 9/25
  Train Loss: 0.9982 | Acc: 0.9139 | F1: 0.9138
  Val   Loss: 1.1171 | Acc: 0.7909 | F1: 0.7908
Epoch 10/25
  Train Loss: 0.9878 | Ac

In [25]:
from google.colab import files
files.download('train')
files.download('test')
for model_name, model_path in MODELS_NAME.items():
  for aug_style in AUG_STYLES:
    files.download(f"saved_model/{model_name}_{aug_style}.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>