In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
import os
from nltk.corpus import stopwords
import torch
import torch.nn as nn
from transformers import AutoModel
import random
from tqdm import tqdm
import re
import numpy as np
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# ------------------------------
# Config
# ------------------------------
# Checkpoint name passed to AutoTokenizer / AutoModel.from_pretrained.
MODEL_NAME = "avichr/heBERT_sentiment_analysis"

# Emotion label <-> class id lookups; NUM_LABELS is derived from the mapping.
LABEL2INDEX = {'love': 0, 'anger': 1, 'sadness': 2, 'happy': 3, 'fear': 4}
INDEX2LABEL = {v: k for k, v in LABEL2INDEX.items()}
NUM_LABELS = len(LABEL2INDEX)

# Tokenisation / optimisation hyper-parameters.
MAX_LEN = 128      # sequences are padded/truncated to this many tokens
BATCH_SIZE = 64
EPOCHS = 25
LR = 2e-5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AUG_FRAC = 0.2     # NOTE(review): not referenced in the cells visible here - confirm its use

# Seed every RNG the notebook imports. Python's `random` module was imported
# but previously left unseeded, so anything drawing from it was not
# reproducible between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# ------------------------------
# Dataset Class
# ------------------------------
class TweetDataset(Dataset):
    """Dataset that tokenizes a single text each time an item is requested.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of class ids, indexed in parallel with
            ``texts``.
        tokenizer: Callable that encodes one text; it is invoked with the
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples, i.e. ``len(texts)``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns:
            Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            (a ``torch.long`` tensor built from ``labels[idx]``).
        """
        encode_options = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **encode_options)

        # squeeze(0) drops a leading dimension of size 1; presumably the
        # tokenizer returns a batch axis for the single text - TODO confirm.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample



In [None]:

class IndoBertClassifier(nn.Module):
    """Pretrained transformer encoder with a max-pooled MLP classification head.

    The head is: global max-pool over the sequence axis -> Linear(dense_1)
    -> ReLU -> Dropout -> Linear(dense_2) -> ReLU -> Linear(num_labels).

    ``forward`` returns raw, unnormalised class scores (logits). The notebook
    trains this model with ``nn.CrossEntropyLoss``, which applies log-softmax
    itself; squashing the scores through a sigmoid first confines them to
    (0, 1) and puts a floor under the reachable loss, so no output activation
    is applied here. Apply ``torch.softmax`` to the result if probabilities
    are needed.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        dense_1: Width of the first hidden layer of the head.
        dense_2: Width of the second hidden layer of the head.
        dropout: Dropout probability applied after the first hidden layer.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name=MODEL_NAME,
                 dense_1=64, dense_2=16, dropout=0.05, num_labels=5):
        super(IndoBertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Classification head (mirrors the original Keras architecture)
        self.pool = nn.AdaptiveMaxPool1d(1)   # GlobalMaxPool1D equivalent
        self.fc1 = nn.Linear(hidden_size, dense_1)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(dense_1, dense_2)
        self.fc3 = nn.Linear(dense_2, num_labels)
        self.act_relu = nn.ReLU()
        # Kept so the module's attribute layout is unchanged for existing
        # code; it holds no parameters and is NOT applied in forward().
        self.act_sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch, num_labels)."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # shape: (batch, seq_len, hidden)

        # PyTorch pooling works on (N, C, L), so permute first
        # NOTE(review): the max is taken over every sequence position,
        # padded ones included - confirm that is intended.
        x = embeddings.permute(0, 2, 1)        # (batch, hidden, seq_len)
        x = self.pool(x).squeeze(-1)           # (batch, hidden)

        x = self.fc1(x)
        x = self.act_relu(x)
        x = self.drop1(x)

        x = self.fc2(x)
        x = self.act_relu(x)

        # Raw logits: CrossEntropyLoss expects unnormalised scores, and
        # argmax-based predictions are identical with or without a sigmoid.
        return self.fc3(x)


In [None]:

# ------------------------------
# Training & Evaluation Functions
# ------------------------------
def train_one_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and report its metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches; each batch is a mapping holding
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1)``: the summed batch losses divided by
        ``len(dataloader)``, plus accuracy and weighted F1 computed over
        every sample seen during the pass.
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []

    for batch in dataloader:
        # Move the three tensors of the batch onto the configured device.
        ids, mask, targets = (
            batch[key].to(DEVICE)
            for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        scores = model(ids, mask)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # Predicted class = index of the highest score along dim 1.
        predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
        expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    return running_loss / len(dataloader), accuracy, weighted_f1


In [None]:


def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches; each batch is a mapping holding
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1)``: the summed batch losses divided by
        ``len(dataloader)``, plus accuracy and weighted F1 computed over
        every sample in the loader.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (
                batch[key].to(DEVICE)
                for key in ("input_ids", "attention_mask", "labels")
            )

            scores = model(ids, mask)
            running_loss += criterion(scores, targets).item()

            # Predicted class = index of the highest score along dim 1.
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    return running_loss / len(dataloader), accuracy, weighted_f1


In [None]:
    # Load the tokenizer that goes with the MODEL_NAME checkpoint
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Read the dataset. The following cells use its "hebrew_tweet" column as
    # the model input and its "label" column as the target, so both must exist.
    df = pd.read_csv("two_phase_back_translation.csv")
    # Bare expression: renders the frame as this cell's output
    df

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,label,original_tweet,en_tweet,hebrew_tweet,en_to_id_bt_tweet,he_bt_en_tweet,fully_bt_tweet
0,0,0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...","As for Jaitiru, the police can't handle the go...","באשר לג'אטירו, המשטרה לא יכולה להתמודד עם המוש...","Adapun Jaitiru, polisi tidak bisa menangani gu...","As for Jatiro, the police can't deal with the ...","Sedangkan Jatiro, polisi tidak bisa berurusan ..."
1,1,1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...","You know, it's supposed to be the girls who ha...","אתה יודע, זה אמור להיות הבנות מי צריך להיות עס...","Kau tahu, itu seharusnya gadis-gadis yang haru...","You know, it's supposed to be the girls who ha...","Kau tahu, itu seharusnya gadis-gadis yang haru..."
2,2,2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,"Amad Photo from Google, deliberately, so that ...","Amad Photo from Google, בכוונה, כך שחברים יכול...","Amad Photo dari Google, sengaja, sehingga tema...","Amad Photo from Google, on purpose, so that fr...","Amad Photo dari Google, sengaja, sehingga tema..."
3,3,3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...","Jatararu Street, part of Mr. Brother's territo...","Jatararu Street, part of Mr. Brother's greatme...","Jataru Street, bagian dari wilayah Mr. Brother...","Jatararu Street, part of Mr. Brother's Greatme...","Jalan Jatararu, bagian dari Greatment Mr Broth..."
4,4,4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...","Share the experience, yesterday at 6: 00 p.m.,...","לחלוק את החוויה, אתמול בשעה 6: 00 בערב, ביטול ...","Berbagi pengalaman, kemarin pukul 6.00 sore, m...","Share the experience, yesterday at 6:00 p.m., ...","Berbagi pengalaman, kemarin pukul 6: 00 pm, me..."
...,...,...,...,...,...,...,...,...,...
4396,4396,4396,love,"Tahukah kamu, bahwa saat itu papa memejamkan m...",Did you know that at that moment I closed her ...,ידעת שבאותו רגע עצמתי את עיניה ועצרתי את המהומ...,Apakah kau tahu bahwa pada saat itu aku menutu...,Did you know that at that moment I closed her ...,Apakah Anda tahu bahwa pada saat itu aku memej...
4397,4397,4397,fear,Sulitnya menetapkan Calon Wapresnya Jokowi di ...,The difficulty of assigning Vice President Jok...,הקושי של הקצאת הבחירות של סגן הנשיא ג'וקובי בש...,Kesulitan dalam menetapkan pemilihan Wakil Pre...,The difficulty of assigning the election of Vi...,Kesulitan untuk menetapkan pemilihan Wapres Ja...
4398,4398,4398,anger,"5. masa depannya nggak jelas. lha iya, gimana ...",Five. Her future's not clear. Yeah.,חמש... העתיד שלה לא ברור.,"5 tahun, masa depannya tidak jelas.",Five... Her future is uncertain.,Lima... Masa depannya tidak pasti.
4399,4399,4399,happy,[USERNAME] dulu beneran ada mahasiswa Teknik U...,[USERNAME] There was actually a student of UI ...,היה למעשה תלמיד של טכנולוגיות UI מי ירה חברתו ...,Sebenarnya ada seorang mahasiswa UI Techniques...,There was actually a student of UI technologie...,Sebenarnya ada seorang mahasiswa teknologi UI ...


In [None]:

    # Encode the string labels as class ids. The guard makes the cell safe to
    # re-run: pushing already-encoded ids through LABEL2INDEX a second time
    # would turn the whole column into NaN.
    if not pd.api.types.is_numeric_dtype(df["label"]):
        df["label"] = df["label"].map(LABEL2INDEX)

    # Series.map yields NaN for any label that is missing from LABEL2INDEX;
    # fail here with a clear message instead of later inside the dataset.
    if df["label"].isna().any():
        raise ValueError("'label' column contains values that are not in LABEL2INDEX")

    # Model input is the Hebrew translation of each tweet.
    texts = df["hebrew_tweet"].tolist()
    labels = df["label"].tolist()

    dataset = TweetDataset(texts, labels, tokenizer)


In [None]:
    # Split into train/val (80/20)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size

    # Use a dedicated, seeded generator so the partition does not depend on
    # how much of the global RNG stream earlier cells consumed. Without it,
    # re-running this cell reshuffles the split and rows that were validation
    # data can end up in the training set of an already-trained model.
    # 42 is the same value the config cell uses for its seeds.
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=split_generator
    )

    # Only the training loader shuffles; validation order is left as-is.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
    # Model, Loss, Optimizer
    # The classifier is built with its default arguments and moved to DEVICE.
    model = IndoBertClassifier().to(DEVICE)
    # CrossEntropyLoss applies log-softmax internally, so it expects raw,
    # unnormalised class scores from the model and integer class ids as targets.
    criterion = nn.CrossEntropyLoss()
    # A single learning rate (LR) is used for every parameter of the model.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
    # Training Loop
    # Every epoch performs one training pass followed by one validation pass
    # and prints the (loss, accuracy, weighted F1) triple of each.
    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"  Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
        print(f"  Val   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

    # Save Model
    # Only the state_dict is written, and only once after the loop, so the
    # file holds the weights from the final epoch (not a best-validation
    # checkpoint).
    os.makedirs("saved_model", exist_ok=True)
    torch.save(model.state_dict(), "saved_model/indobert_tweet_classifier_2p_bt.pt")
    print("Model saved to saved_model/indobert_tweet_classifier_2p_bt.pt")


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1/25
  Train Loss: 1.5721 | Acc: 0.3210 | F1: 0.2166
  Val   Loss: 1.5616 | Acc: 0.3087 | F1: 0.1663
Epoch 2/25
  Train Loss: 1.5078 | Acc: 0.3926 | F1: 0.2963
  Val   Loss: 1.5173 | Acc: 0.3882 | F1: 0.3191
Epoch 3/25
  Train Loss: 1.4468 | Acc: 0.4707 | F1: 0.3937
  Val   Loss: 1.4932 | Acc: 0.3802 | F1: 0.3172
Epoch 4/25
  Train Loss: 1.3836 | Acc: 0.4972 | F1: 0.4200
  Val   Loss: 1.4690 | Acc: 0.4654 | F1: 0.4286
Epoch 5/25
  Train Loss: 1.3239 | Acc: 0.5307 | F1: 0.4541
  Val   Loss: 1.4724 | Acc: 0.4518 | F1: 0.4156
Epoch 6/25
  Train Loss: 1.2841 | Acc: 0.5409 | F1: 0.4689
  Val   Loss: 1.4508 | Acc: 0.4154 | F1: 0.3781
Epoch 7/25
  Train Loss: 1.2477 | Acc: 0.5517 | F1: 0.4817
  Val   Loss: 1.4548 | Acc: 0.3939 | F1: 0.3574
Epoch 8/25
  Train Loss: 1.2231 | Acc: 0.5645 | F1: 0.5018
  Val   Loss: 1.4460 | Acc: 0.3859 | F1: 0.3595
Epoch 9/25
  Train Loss: 1.1964 | Acc: 0.6003 | F1: 0.5460
  Val   Loss: 1.4342 | Acc: 0.4291 | F1: 0.4122
Epoch 10/25
  Train Loss: 1.1651 | Ac