In [1]:
!pip -q install transformers accelerate datasets scikit-learn torch pandas numpy

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === File paths ===
# English train/test CSV files.
en_train_file, en_test_file = 'datasets/pfc_train.csv', 'datasets/pfc_test.csv'
# Filipino CSV files: the training file is unlabeled data, the test file is used with labels.
fil_train_file, fil_test_file = 'datasets/fil_train.csv', 'datasets/fil_test_1000.csv'

In [4]:
# === Load Data ===
# Each split is reduced to a small fixed-seed sample (40 train / 10 test rows).
# To use the full files instead, replace the helper call with a plain pd.read_csv(path).
def _read_sample(csv_path, n_rows):
    """Read `csv_path` and return `n_rows` randomly drawn rows (random_state=42)."""
    return pd.read_csv(csv_path).sample(n=n_rows, random_state=42)


en_train_df = _read_sample(en_train_file, 40)
en_test_df = _read_sample(en_test_file, 10)
fil_train_df = _read_sample(fil_train_file, 40)
fil_test_df = _read_sample(fil_test_file, 10)

In [5]:
# === Label cleanup ===
# Cast the "label" column of every labeled split to int (the frames are updated in place).
for _labeled_df in (en_train_df, fil_test_df, en_test_df):
    _labeled_df["label"] = _labeled_df["label"].astype(int)

In [6]:
# Display the sampled English training frame (last expression of the cell is rendered).
en_train_df

Unnamed: 0,text,code_frames,label
6779,DOH asks private sector not to procure bivalen...,2,1
9708,The Philippines will continue to engage with c...,14,13
7590,6 domestic flights canceled due to inclement w...,15,14
6449,The Department of Justice (DOJ) is planning to...,7,6
518,"MANILA, Philippines – Just like in previous ye...",6,5
3383,MANILA – approval of the bill reinstituting a...,6,5
6797,Water service interruption is expected to be e...,10,9
3317,Media giant GMA Network is set to proudly repr...,15,14
3894,"MANILA – Municipal fishermen, fishpond owners,...",12,11
7166,"Galvez says Cagayan, CamSur open to having EDC...",8,7


In [7]:
# Display the sampled English test frame (last expression of the cell is rendered).
en_test_df

Unnamed: 0,text,code_frames,label
756,Lawmaker slams cops for inconsistent stories i...,7,6
642,The Philippines on Friday called on China anew...,14,13
2402,"MANILA, Philippines – Doctors “doctored” or fa...",7,6
1944,Justice Secretary Jesus Crispin Remulla on Tue...,7,6
252,MANILA – Senators on Tuesday President Ferdin...,5,4
353,Sandiganbayan affirms denial of ex-DBM exec's ...,7,6
1316,Driver of AUV in Salilig case found dead in Ta...,7,6
1642,MANILA – The country’s daily average of new co...,9,8
237,"MANILA, Philippines – The mothers of disappea...",7,6
1950,Senator Joseph Victor “JV” Ejercito on Tuesday...,6,5


In [8]:
# Display the sampled Filipino training frame; it is loaded from the unlabeled file.
fil_train_df

Unnamed: 0,text,label
16522,"‘Best in taga-sharon?’ Backpack ng netizen, lu...",
4307,Libreng condom bubuhos sa France,
3353,Sinugod ng Chinese fighter jet ang ilang US na...,
7038,DAPAT umanong imbestigahan si Vice President L...,
8288,TINAWAG na anti-poor ni Senator Sherwin Gatcha...,
2214,INAMIN ng digital solutions provider na GCash ...,
17969,Dismayado ang batikang aktres na si Aiko Melen...,
1087,Todas ang isang magkapatid matapos silang maba...,
11048,Naramdaman ang magnitude 3.2 na lindol sa Buti...,
3649,"Kalidad ng serbisyo sa Malabon, binago ni Sand...",


In [9]:
# Display the sampled Filipino test frame (used as the validation set in the training loop).
fil_test_df

Unnamed: 0,text,code-frame,label
521,NAREKOBER ng mga otoridad ang bangkay ng ginan...,"7. Law and Order, Crime and Justice",6
737,"Minadaling importasyon ng asukal, kinuwestiyon...",12. Public Opinion,11
740,Lalaking tinulungan pa rin asong nalaglag sa p...,15. Other,14
660,Mga dumalo sa UniTeam rally nawalan ng cellpho...,"7. Law and Order, Crime and Justice",6
411,Telcos wala nang dahilan ngayon para hindi map...,13. Political,12
678,"Sa hangaring maabot ang mas maraming Pilipino,...",11. Cultural Identity,10
626,"China Telecom, ikatlong telco?",2. Capacity and Resources,1
513,Mayroong 157 flights ang naka-schedule na uma...,15. Other,14
859,Isinailalim sa sa state of calamity ng Sanggun...,9. Health and Safety,8
136,Mga Fil-Am rumampa sa protesta vs hate crime,4. Fairness and Equality,3


In [36]:
# === Config ===

# Backbone checkpoint: "bert-base-multilingual-cased" or "xlm-roberta-base".
model_name = "bert-base-multilingual-cased"

# Number of target classes and the fixed token length used by the tokenizer.
num_labels, max_length = 15, 256

# Optimisation hyperparameters handed to TrainingArguments.
learning_rate, weight_decay = 2e-5, 0.01
train_batch_size = eval_batch_size = 32
num_epochs = 3

# Self-training: pseudo-labeled examples kept per class in a round, and number of rounds.
num_top_k, num_self_train = 5, 2

# Switch for the adversarial pass implemented in CustomTrainer.
adv_training = True

In [11]:
# === Tokenizer ===
# Load the tokenizer that belongs to the checkpoint selected in `model_name`;
# `preprocess` and the training loop read this notebook-level object.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def preprocess(example):
    """Tokenize the "text" field of `example` with the notebook-level tokenizer.

    Inputs are truncated and padded to `max_length` tokens. The tokenizer's
    return value is passed back unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(example["text"], **encode_options)

In [13]:
# === Metrics ===
def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and RMSE for a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``. The predicted class
            is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys "accuracy", "f1" (macro-averaged) and "rmse".
        RMSE is computed directly on the integer class ids.
    """
    raw_scores, gold = eval_pred
    gold = np.array(gold)
    predicted = torch.tensor(raw_scores).argmax(dim=-1).numpy()

    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average="macro")
    rmse = np.sqrt(mean_squared_error(gold, predicted))
    return {"accuracy": accuracy, "f1": macro_f1, "rmse": rmse}

In [14]:
def get_embedding_name(model_name):
    """Return the parameter-name fragment of the word-embedding matrix for `model_name`.

    The architecture is detected with a case-insensitive substring test, so
    mixed-case checkpoint names or local paths (e.g. "XLM-RoBERTa-base") are
    recognised as well.

    Args:
        model_name: Checkpoint name or path.

    Returns:
        'roberta.embeddings.word_embeddings' or 'bert.embeddings.word_embeddings'.

    Raises:
        ValueError: If the name contains neither "roberta" nor "bert".
    """
    normalized = model_name.lower()
    # "roberta" has to be tested first: the string "roberta" itself contains "bert".
    if 'roberta' in normalized:
        return 'roberta.embeddings.word_embeddings'
    if 'bert' in normalized:
        return 'bert.embeddings.word_embeddings'
    raise ValueError(f"Unsupported model architecture in: {model_name}")

In [31]:
# === Trainer with Optional Adversarial ===
class CustomTrainer(Trainer):
    """Trainer that can add an adversarial pass on the embedding weights to every step.

    With `adv_training` enabled, each training step
      1. runs the regular base-class step,
      2. shifts every trainable parameter whose name contains `emb_name` by
         `epsilon` along its gradient divided by the gradient norm,
      3. runs a second base-class step on the shifted weights, and
      4. puts the original weights back.
    The returned loss is the sum of the two step losses.

    Args:
        adv_training: Enables the adversarial pass.
        epsilon: Size of the perturbation applied to the embedding weights.
        emb_name: Substring identifying the embedding parameters; required
            when `adv_training` is True.
        *args, **kwargs: Forwarded unchanged to `Trainer.__init__`.

    Raises:
        ValueError: If `adv_training` is True and `emb_name` is None.
    """

    def __init__(self, *args, adv_training=False, epsilon=1.0, emb_name=None, **kwargs):
        # Fail fast, before the base class is initialised.
        if adv_training and emb_name is None:
            raise ValueError("Embedding layer name (`emb_name`) must be provided when adversarial training is enabled.")

        super().__init__(*args, **kwargs)
        self.adv_training = adv_training
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.backup = {}  # parameter name -> copy of the unperturbed weights

    def attack(self, model):
        """Back up the embedding parameters and perturb them along their gradient."""
        for name, param in model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                # Back up first so restore() has an entry even when no perturbation is applied.
                self.backup[name] = param.data.clone()
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                # Skip zero, NaN and infinite norms: dividing by them would poison the weights.
                if torch.isfinite(norm) and norm != 0:
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self, model):
        """Put back every parameter saved by `attack` and clear the backup."""
        for name, param in model.named_parameters():
            # Only restore what was actually saved, so this is safe to call
            # from a `finally` block even if `attack` stopped part-way.
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

    def training_step(self, model, inputs, loss_fn=None):
        """Run one training step, plus an adversarial step when enabled.

        NOTE(review): despite its name, `loss_fn` is simply whatever the base
        Trainer passes as its third positional argument (recent transformers
        releases appear to pass `num_items_in_batch` here) — confirm against
        the installed version. It is forwarded only when it is not None, so
        base classes with a two-argument `training_step` keep working.
        """
        extra = () if loss_fn is None else (loss_fn,)
        loss = super().training_step(model, inputs, *extra)

        if not self.adv_training:
            return loss

        try:
            self.attack(model)
            adv_loss = super().training_step(model, inputs, *extra)
        finally:
            # Never leave perturbed embeddings in the model, even if the adversarial pass fails.
            self.restore(model)

        return loss + adv_loss

In [32]:
def sort(train_dataset, unlabeled_dataset, logits, label_list, num_k):
    """Move the most confident pseudo-labeled examples into the training set.

    Each unlabeled example gets the argmax class of its softmax probabilities
    as pseudo-label and the corresponding probability as confidence. For every
    label in `label_list`, the `num_k` most confident examples predicted as
    that label are copied, given a "label" entry, and appended to the
    training set.

    Args:
        train_dataset: List of labeled example dicts.
        unlabeled_dataset: Indexable sequence of example dicts without labels.
        logits: Per-example class scores, in the same order as `unlabeled_dataset`.
        label_list: Labels for which examples are selected.
        num_k: Maximum number of examples selected per label.

    Returns:
        (updated_train, remaining_unlabeled): a new list with the selected
        examples appended to `train_dataset`, and the list of unlabeled
        examples that were not selected. The input containers are not modified.
    """
    if len(unlabeled_dataset) == 0:
        # Nothing to select; torch.max over an empty batch would raise.
        return list(train_dataset), []

    probs = F.softmax(torch.tensor(logits), dim=-1)
    confidences, pseudo_labels = torch.max(probs, dim=-1)

    # Group (example index, confidence) pairs by predicted label.
    label2indices = {label: [] for label in label_list}
    for idx, (pred, conf) in enumerate(zip(pseudo_labels.tolist(), confidences.tolist())):
        label2indices[pred].append((idx, conf))

    selected_indices = []
    print("Pseudo-labeled instance count per class:")
    for label in label_list:
        candidates = label2indices[label]
        if not candidates:
            print(f"Class {label}: No confident instances")
            continue
        top_k = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:num_k]
        selected_indices.extend(idx for idx, _ in top_k)
        top_confidences = [conf for _, conf in top_k]
        min_conf = min(top_confidences)
        max_conf = max(top_confidences)
        print(f"Class {label}: {len(top_k)} instances selected (out of {len(candidates)}), Confidence range: {min_conf:.4f}–{max_conf:.4f}")

    # Copy before labeling so the caller's unlabeled examples stay untouched.
    selected = []
    for i in selected_indices:
        example = unlabeled_dataset[i].copy()
        example["label"] = int(pseudo_labels[i])
        selected.append(example)

    # Set membership keeps the filter linear in the pool size.
    chosen = set(selected_indices)
    remaining_unlabeled = [unlabeled_dataset[i] for i in range(len(unlabeled_dataset)) if i not in chosen]
    updated_train = train_dataset + selected
    return updated_train, remaining_unlabeled


In [33]:
# === Prediction + Pseudo-Label Selection ===
def predict(trainer, train_dataset, unlabeled_dataset, label_list, num_k):
    """Score the unlabeled pool with `trainer` and hand the logits to `sort`.

    Returns the `(updated_train, remaining_unlabeled)` pair produced by `sort`.
    """
    unlabeled_logits = trainer.predict(unlabeled_dataset).predictions
    updated_and_remaining = sort(train_dataset, unlabeled_dataset, unlabeled_logits, label_list, num_k)
    new_train, still_unlabeled = updated_and_remaining
    return new_train, still_unlabeled

In [None]:
# === Main Training Loop ===
def self_training_loop():
    """Run `num_self_train` rounds of self-training and keep the best-F1 model.

    Setup: `en_train_df` is tokenized into the labeled seed set, `fil_train_df`
    (without its "label" column) into the unlabeled pool, and `fil_test_df`
    into the validation set.

    Every round:
      1. loads a fresh model from `model_name` and trains it with CustomTrainer,
      2. except in the last round, moves the most confident pseudo-labeled
         examples from the unlabeled pool into the training set (skipped when
         the pool is already empty),
      3. evaluates on the validation set and, when the macro-F1 improves on
         the best value so far, saves model and tokenizer to the round's
         output directory.

    Reads the notebook-level configuration variables and data frames; returns
    nothing and reports progress with print().
    """
    train_dataset = Dataset.from_pandas(en_train_df).map(preprocess, batched=True).to_list()
    unlabeled_dataset = Dataset.from_pandas(fil_train_df.drop(columns=["label"], errors="ignore")).map(preprocess, batched=True).to_list()
    val_dataset = Dataset.from_pandas(fil_test_df).map(preprocess, batched=True)
    label_list = list(range(num_labels))

    # Loop-invariant pieces: only the round index changes between output directories,
    # and the embedding name is needed only when adversarial training is on.
    run_dir = "sl_adv" if adv_training else "sl"
    checkpoint_tag = model_name.replace('/', '_')
    embedding_name = get_embedding_name(model_name) if adv_training else None

    best_f1 = -1.0
    best_iteration = -1
    best_epoch = -1

    for i in range(num_self_train):
        print(f"\nSELF-LEARNING ITERATION {i + 1}/{num_self_train}")

        # Every round starts again from the pretrained checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        output_dir = f"./results/{run_dir}/{checkpoint_tag}_{i}"

        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="no",
            logging_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            report_to="none"
        )

        # Rows in `train_dataset` (seed rows and promoted pseudo-labeled rows) were
        # tokenized by `preprocess` during setup, so they are not tokenized again here.
        train_ds = Dataset.from_list(train_dataset)

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            adv_training=adv_training,
            emb_name=embedding_name
        )

        trainer.train()

        # Run pseudo-labeling only if more training rounds are left
        if i < num_self_train - 1:
            if len(unlabeled_dataset) > 0:
                train_dataset, unlabeled_dataset = predict(
                    trainer, train_dataset, unlabeled_dataset, label_list, num_top_k
                )
                print(f"\nTraining set size after this round: {len(train_dataset)}")
                print(f"Remaining unlabeled examples after this round: {len(unlabeled_dataset)}")
            else:
                # Predicting on an empty pool would fail; there is nothing left to promote.
                print("\nUnlabeled pool is exhausted; skipping pseudo-labeling for this round.")

        # Evaluate current model on validation set
        predictions = trainer.predict(val_dataset)
        logits = predictions.predictions
        # NOTE(review): when the validation set has no labels, the fallback scores the
        # predictions against themselves, so the metrics come out perfect — confirm
        # that this fallback is really wanted.
        labels = predictions.label_ids if predictions.label_ids is not None else np.argmax(logits, axis=1)
        metrics = compute_metrics((logits, labels))

        print(f"Metrics: {metrics}")

        if metrics["f1"] > best_f1:
            best_f1 = metrics["f1"]
            best_iteration = i + 1
            best_epoch = trainer.state.epoch
            print(f"New best model found on iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")

            # === Save full model (weights + config + tokenizer)
            # Unwrap a `.module` wrapper if the trainer's model has one.
            model_to_save = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            print(f"Saved full best model and tokenizer to: {output_dir}")

    print(f"\nBest overall model was from iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")


In [41]:
# Entry point of the notebook: start the self-training run.
# The guard keeps the run from starting if this code is imported as a module.
if __name__ == "__main__":
    self_training_loop()

Map: 100%|██████████| 40/40 [00:00<00:00, 1138.76 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 8590.48 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 561.15 examples/s]



SELF-LEARNING ITERATION 1/2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 40/40 [00:00<00:00, 3289.71 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,5.4338,2.686733,0.1,0.03125,8.024961
2,5.2775,2.652806,0.2,0.085714,6.074537


: 