In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os


In [2]:

# ------------------------------
# Config
# ------------------------------
MODEL_NAME = "indolem/indobertweet-base-uncased"
LABEL2INDEX = {'love': 0, 'anger': 1, 'sadness': 2, 'happy': 3, 'fear': 4}
INDEX2LABEL = {v: k for k, v in LABEL2INDEX.items()}
NUM_LABELS = len(LABEL2INDEX)
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 25
LR = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:


# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }



In [4]:
import torch
import torch.nn as nn
from transformers import AutoModel

class IndoBertClassifier(nn.Module):
    def __init__(self, model_name=MODEL_NAME,
                 conv_out_channels=128, conv_kernel_size=3,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super(IndoBertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # --- Convolutional Layer added here ---
        # Conv1D expects (batch, channels, seq_len)
        self.conv1 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=conv_out_channels,
            kernel_size=conv_kernel_size,
            padding=conv_kernel_size // 2   # keeps same seq_len
        )
        self.act_conv = nn.ReLU()

        # --- Pooling and Dense Layers ---
        self.pool = nn.AdaptiveMaxPool1d(1)   # GlobalMaxPool1D equivalent
        self.fc1 = nn.Linear(conv_out_channels, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()
        self.act_sigmoid = nn.Sigmoid()       # multi-label case

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # (batch, seq_len, hidden)

        # Prepare for Conv1D
        x = embeddings.permute(0, 2, 1)  # (batch, hidden, seq_len)

        # --- Apply convolutional feature extraction ---
        x = self.conv1(x)
        x = self.act_conv(x)

        # --- Pooling over sequence ---
        x = self.pool(x).squeeze(-1)  # (batch, conv_out_channels)

        # --- Dense Layers ---
        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        logits = self.fc3(x)
        out = self.act_sigmoid(logits)
        return out


In [5]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [6]:


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return total_loss / len(dataloader), acc, f1


In [7]:


# ------------------------------
# Main Training Script
# ------------------------------
if __name__ == "__main__":
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Read dataset (must have "text" and "label" columns)
    df = pd.read_csv("Twitter_Emotion_Dataset.csv")
    df["label"] = df["label"].map(LABEL2INDEX)

    texts = df["tweet"].tolist()
    labels = df["label"].tolist()

    dataset = TweetDataset(texts, labels, tokenizer)

    # Split into train/val (80/20)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Model, Loss, Optimizer
    model = IndoBertClassifier().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    # Training Loop
    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
        print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

    # Save Model
    os.makedirs("saved_model", exist_ok=True)
    torch.save(model.state_dict(), "saved_model/indobert_tweet_classifier_aug.pt")
    print("Model saved to saved_model/indobert_tweet_classifier_aug.pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1/25
  Train Loss: 1.6006 | Acc: 0.2554 | F1: 0.2156
  Val   Loss: 1.5783 | Acc: 0.5062 | F1: 0.4473
Epoch 2/25
  Train Loss: 1.5200 | Acc: 0.5705 | F1: 0.5552
  Val   Loss: 1.4657 | Acc: 0.6005 | F1: 0.5874
Epoch 3/25
  Train Loss: 1.4029 | Acc: 0.7159 | F1: 0.7122
  Val   Loss: 1.3645 | Acc: 0.7423 | F1: 0.7393
Epoch 4/25
  Train Loss: 1.2948 | Acc: 0.8330 | F1: 0.8323
  Val   Loss: 1.2905 | Acc: 0.7696 | F1: 0.7727
Epoch 5/25
  Train Loss: 1.2063 | Acc: 0.8625 | F1: 0.8611
  Val   Loss: 1.2374 | Acc: 0.7719 | F1: 0.7726
Epoch 6/25
  Train Loss: 1.1304 | Acc: 0.8946 | F1: 0.8932
  Val   Loss: 1.2019 | Acc: 0.7821 | F1: 0.7814
Epoch 7/25
  Train Loss: 1.0762 | Acc: 0.9213 | F1: 0.9209
  Val   Loss: 1.1761 | Acc: 0.7775 | F1: 0.7783
Epoch 8/25
  Train Loss: 1.0331 | Acc: 0.9435 | F1: 0.9433
  Val   Loss: 1.1510 | Acc: 0.7866 | F1: 0.7878
Epoch 9/25
  Train Loss: 0.9990 | Acc: 0.9588 | F1: 0.9587
  Val   Loss: 1.1462 | Acc: 0.7809 | F1: 0.7788
Epoch 10/25
  Train Loss: 0.9796 | Ac