In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from transformers import AutoModel
import random
from tqdm import tqdm
import re
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:

# ------------------------------
# Config
# ------------------------------
MODEL_NAME = "indolem/indobertweet-base-uncased"
LABEL2INDEX = {'love': 0, 'anger': 1, 'sadness': 2, 'happy': 3, 'fear': 4}
INDEX2LABEL = {v: k for k, v in LABEL2INDEX.items()}
NUM_LABELS = len(LABEL2INDEX)
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 25
LR = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AUG_FRAC=0.2
indo_stop_words = stopwords.words(fileids='indonesian')
indo_stop_words.append("[USERNAME]")
indo_stop_words.append("url")

In [3]:


# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }



In [4]:

class IndoBertClassifier(nn.Module):
    def __init__(self, model_name=MODEL_NAME,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super(IndoBertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Layers according to your Keras architecture
        self.pool = nn.AdaptiveMaxPool1d(1)   # GlobalMaxPool1D equivalent
        self.fc1 = nn.Linear(hidden_size, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()
        self.act_sigmoid = nn.Sigmoid()       # multi-label case

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        # PyTorch pooling works on (N, C, L), so permute first
        x = embeddings.permute(0, 2, 1)        # (batch, hidden, seq_len)
        x = self.pool(x).squeeze(-1)           # (batch, hidden)

        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        logits = self.fc3(x)
        out = self.act_sigmoid(logits)
        return out


In [5]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [6]:


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [7]:
def random_deletion(words, p=0.2):
    """ Randomly delete words with probability p """
    if len(words) == 1:
        return words
    return [w for w in words if random.uniform(0, 1) > p]


def random_swap(words, n=1):
    """ Swap two words n times """
    words = words.copy()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return words


def random_insertion(words, n=1):
    """ Insert a random word from the sentence into a random position """
    words = words.copy()
    for _ in range(n):
        new_word = random.choice(words)
        insert_pos = random.randint(0, len(words))
        words.insert(insert_pos, new_word)
    return words


def augment_text(text):
    words = text.split()
    choice = random.choice(['delete', 'swap', 'insert'])

    if choice == 'delete':
        aug_words = random_deletion(words)
    elif choice == 'swap':
        aug_words = random_swap(words)
    elif choice == 'insert':
        aug_words = random_insertion(words)

    return " ".join(aug_words)


In [8]:
def augment_dataset(df, text_col="tweet", label_col="label", frac=0.1):
    """
    Augment a fraction of dataset rows and return a bigger dataset
    """
    # sample rows for augmentation
    sampled = df.sample(frac=frac, random_state=42)

    augmented_texts = []
    augmented_labels = []

    for _, row in tqdm(sampled.iterrows()):
        aug_text = augment_text(row[text_col])  # from previous code
        augmented_texts.append(aug_text)
        augmented_labels.append(row[label_col])

    # make augmented dataframe
    df_aug = pd.DataFrame({text_col: augmented_texts, label_col: augmented_labels})

    # concatenate with original
    df_new = pd.concat([df, df_aug], ignore_index=True)
    return df_new


In [9]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = " ".join([y for y in text.split() if y not in indo_stop_words])
    return text


  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [10]:
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Read dataset (must have "text" and "label" columns)
    df = pd.read_csv("../../../../Downloads/post_bt.csv")
    # df['tweet'] = df['tweet'].apply(lambda x: clean_text(x))
    df = augment_dataset(df, frac=AUG_FRAC)
    df["label"] = df["label"].map(LABEL2INDEX)

    texts = df["tweet"].tolist()
    labels = df["label"].tolist()

    dataset = TweetDataset(texts, labels, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1760it [00:00, 20146.60it/s]


In [11]:
    # Split into train/val (80/20)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [12]:
    # Model, Loss, Optimizer
    model = IndoBertClassifier().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [13]:
    # Training Loop
    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
        print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

    # Save Model
    os.makedirs("saved_model", exist_ok=True)
    torch.save(model.state_dict(), "saved_model/indobert_tweet_classifier.pt")
    print("Model saved to saved_model/indobert_tweet_classifier.pt")


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1/25
  Train Loss: 1.5537 | Acc: 0.3344 | F1: 0.2835
  Val   Loss: 1.4868 | Acc: 0.5258 | F1: 0.4942
Epoch 2/25
  Train Loss: 1.4146 | Acc: 0.6210 | F1: 0.6158
  Val   Loss: 1.3698 | Acc: 0.6867 | F1: 0.6773
Epoch 3/25
  Train Loss: 1.2967 | Acc: 0.7324 | F1: 0.7289
  Val   Loss: 1.2979 | Acc: 0.7511 | F1: 0.7528
Epoch 4/25
  Train Loss: 1.1949 | Acc: 0.8199 | F1: 0.8190
  Val   Loss: 1.2369 | Acc: 0.7681 | F1: 0.7671
Epoch 5/25
  Train Loss: 1.1120 | Acc: 0.8777 | F1: 0.8773
  Val   Loss: 1.1985 | Acc: 0.7804 | F1: 0.7803
Epoch 6/25
  Train Loss: 1.0517 | Acc: 0.9127 | F1: 0.9124
  Val   Loss: 1.1771 | Acc: 0.7818 | F1: 0.7772
Epoch 7/25
  Train Loss: 1.0098 | Acc: 0.9322 | F1: 0.9320
  Val   Loss: 1.1447 | Acc: 0.7989 | F1: 0.7994
Epoch 8/25
  Train Loss: 0.9842 | Acc: 0.9464 | F1: 0.9462
  Val   Loss: 1.1487 | Acc: 0.7932 | F1: 0.7898
Epoch 9/25
  Train Loss: 0.9743 | Acc: 0.9493 | F1: 0.9492
  Val   Loss: 1.1309 | Acc: 0.8036 | F1: 0.8030
Epoch 10/25
  Train Loss: 0.9692 | Ac