## **Importing Resources**

In [64]:
!pip -q install transformers accelerate datasets scikit-learn torch pandas numpy

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import random
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.metrics import classification_report

## **Loading Datasets**

In [None]:
# CSV locations, grouped by language: English (en_*) and Filipino (fil_*).
en_train_file, en_test_file = (
    'datasets/final/mpfc_train.csv',
    'datasets/final/mpfc_test.csv',
)
fil_train_file, fil_test_file = (
    'datasets/final/fil_train.csv',
    'datasets/final/fil_test.csv',
)

In [None]:
# Read the four CSVs in one pass; the order of the paths matches the order
# of the names on the left-hand side.
en_train_df, en_test_df, fil_train_df, fil_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (en_train_file, en_test_file, fil_train_file, fil_test_file)
)

In [None]:
# Cast the label column to int on every frame that actually carries labels.
# fil_train_df is deliberately left out: its label column is empty, and
# casting missing values to int would raise.
for labeled_df in (en_train_df, fil_test_df, en_test_df):
    labeled_df['label'] = labeled_df['label'].astype(int)

In [69]:
# Preview of the English training frame; in the rows shown, label == code_frames - 1.
en_train_df

Unnamed: 0,text,code_frames,label
0,Senator Sherwin Gatchalian filed a civil lawsu...,7,6
1,AVOID COLLATERAL DAMAGE FROM NRA'S CAMPAIGN,15,14
2,MANILA – Human immunodeficiency virus (HIV) in...,9,8
3,"MANILA, Philippines – President Ferdinand Marc...",2,1
4,Japanese Embassy in PH thanks DOJ over deporta...,14,13
...,...,...,...
19995,MANILA – President Ferdinand R. Marcos Jr. on ...,2,1
19996,"Do you have a question on the news - local, na...",12,11
19997,Davao Oriental 2nd district Rep. Cheeno Almari...,13,12
19998,The Philippine government is eyeing to deport ...,7,6


In [70]:
# Preview of the English test frame (same columns as the English training frame).
en_test_df

Unnamed: 0,text,code_frames,label
0,Physical distancing in classrooms may be eased...,10,9
1,Bishop took on sensitive social issues\r\n,3,2
2,MANILA – President Ferdinand R. Marcos Jr. has...,14,13
3,Florida voters strongly support an increase in...,12,11
4,The Supreme Court had approved new state death...,5,4
...,...,...,...
4995,"Telemachus 'Tel' Orfanos, 27, survived mass sh...",10,9
4996,"""Open Carry Picnic"" -- a mix of a typical outd...",12,11
4997,ASEAN first: Philippine presidents and their s...,13,12
4998,The Bureau of Immigration (BI) has stopped ano...,7,6


In [71]:
# Preview of the Filipino training frame; its label column is empty, and the
# self-training loop drops that column and treats these rows as unlabeled.
fil_train_df

Unnamed: 0,text,label
0,"Sa 110, missing 33 pa",
1,Masarap talagang chumibog ng malalamig na pagk...,
2,Todas sa sama ng panahon 43 na — NDRRMC,
3,Mga gov’t worker may tig-20K bonus pa,
4,3 nirapido ng ‘Bonnet Gang’ sa kotse,
...,...,...
19993,"Kabataan Partylist, nakiisa sa kilos-protesta ...",
19994,"DOH, nagbabala sa publiko vs karaniwang sakit ...",
19995,Tuloy ang transport strike sa Marso 6 hanggang...,
19996,"Barko sa Palawan, nasunog, lumubog; 2 tripulan...",


In [72]:
# Preview of the labeled Filipino test frame.
fil_test_df

Unnamed: 0,text,code_frames,label
0,Isang umano’y tinaguriang ‘shabu queen’ at lid...,7,6
1,Anthrax infection kumalat sa Cagayan,9,8
2,TESDA: Mga tech-voc graduate swak sa trabaho,10,9
3,Nagkamit ng unang pwesto ang isang Filipina st...,15,14
4,NEDA inaprub tapyas taripa sa e-vehicle,6,5
...,...,...,...
4979,"Typhoon Betty, patuloy na humihina sa karagata...",9,8
4980,'Sarap maging tatay!' Post ng netizen tungkol ...,11,10
4981,"TESDA, maglulunsad ng training programs para s...",2,1
4982,"Anne Curtis, nagdiwang ng kaarawan sa ‘It’s Sh...",15,14


## **Preparing Training Setup**

In [None]:
# Choose model: 'bert-base-multilingual-cased' or 'xlm-roberta-base'
# NOTE: self_training_loop() reads model_name and adv_training as globals, so
# this cell must be edited and re-run before each experiment section below.
model_name = 'xlm-roberta-base'

# Training hyperparameters
num_labels = 15            # size of the classification head; label ids are range(num_labels)
max_length = 128           # preprocess() pads/truncates every text to this many tokens
learning_rate = 2e-5       # forwarded to TrainingArguments
train_batch_size = 32      # per-device training batch size
eval_batch_size = 32       # per-device evaluation batch size
num_epochs = 3             # epochs per self-training iteration
weight_decay = 0.01        # forwarded to TrainingArguments

# For self-training
num_top_k = 40             # max pseudo-labeled examples promoted per predicted class, per round
num_self_train = 3         # training rounds; pseudo-labeling runs after every round but the last

# For adversarial training
# True: CustomTrainer adds a second, perturbed pass per step and results go to
# ./results/sl_adv/...; False: plain training, results go to ./results/sl/...
adv_training = True

In [None]:
# Tokenizer for the configured model_name; preprocess() reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [75]:
def preprocess(example):
    """Tokenize the ``text`` field of ``example``.

    Relies on the module-level ``tokenizer`` and ``max_length``: inputs are
    truncated to ``max_length`` and padded up to ``max_length``.

    Args:
        example: Mapping with a ``'text'`` entry (used with
            ``Dataset.map(..., batched=True)`` in this notebook).

    Returns:
        The result of the tokenizer call.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(example['text'], **tokenizer_options)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and RMSE from logits and gold labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'`` (weighted average) and
        ``'rmse'`` (square root of the mean squared error between the label
        ids and the predicted class ids).
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    preds = torch.tensor(logits).argmax(dim=-1).numpy()
    labels = np.array(labels)

    accuracy = accuracy_score(labels, preds)
    weighted_f1 = f1_score(labels, preds, average='weighted')
    rmse = np.sqrt(mean_squared_error(labels, preds))
    return {'accuracy': accuracy, 'f1': weighted_f1, 'rmse': rmse}

In [77]:
def get_embedding_name(model_name):
    """Map a model name to the parameter-name prefix of its word embeddings.

    Args:
        model_name: Model identifier string.

    Returns:
        ``'roberta.embeddings.word_embeddings'`` if ``model_name`` contains
        ``'roberta'``, otherwise ``'bert.embeddings.word_embeddings'`` if it
        contains ``'bert'``.

    Raises:
        ValueError: If neither marker occurs in ``model_name``.
    """
    # Order matters: the string 'roberta' itself contains 'bert', so the
    # more specific marker has to be tried first.
    known_architectures = (
        ('roberta', 'roberta.embeddings.word_embeddings'),
        ('bert', 'bert.embeddings.word_embeddings'),
    )
    for marker, embedding_path in known_architectures:
        if marker in model_name:
            return embedding_path
    raise ValueError(f"Unsupported model architecture in: {model_name}")

In [None]:
class CustomTrainer(Trainer):
    """Trainer with an optional adversarial pass on selected embedding weights.

    When ``adv_training`` is true, each training step:
      1. runs the parent ``training_step`` on the unmodified model,
      2. shifts every matching parameter along its gradient direction by a
         vector of norm ``epsilon`` (``attack``),
      3. runs the parent ``training_step`` again on the perturbed model,
      4. puts the original weights back (``restore``).

    Args:
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
        adv_training: Enable the adversarial second pass.
        epsilon: Norm of the perturbation added to each matching parameter.
        emb_name: Substring matched against ``model.named_parameters()`` names
            to pick the parameters to perturb. Required when ``adv_training``
            is true.

    Raises:
        ValueError: If ``adv_training`` is true and ``emb_name`` is ``None``.
    """

    def __init__(self, *args, adv_training=False, epsilon=1.0, emb_name=None, **kwargs):
        # Validate first: the check only depends on the arguments, so there is
        # no reason to build the whole Trainer before rejecting them.
        if adv_training and emb_name is None:
            raise ValueError('Embedding layer name (`emb_name`) must be provided when adversarial training is enabled.')

        super().__init__(*args, **kwargs)
        self.adv_training = adv_training
        self.epsilon = epsilon
        self.emb_name = emb_name
        # name -> copy of the parameter data taken just before perturbation.
        self.backup = {}

    def attack(self, model):
        """Back up and perturb, in place, every trainable parameter whose name contains ``emb_name``.

        The perturbation is ``epsilon * grad / ||grad||``. A parameter is
        backed up but left unperturbed when it has no gradient, or when the
        gradient norm is zero or NaN.
        """
        for name, param in model.named_parameters():
            if not (param.requires_grad and self.emb_name in name):
                continue
            self.backup[name] = param.data.clone()
            grad = param.grad
            if grad is None:
                # Nothing to follow for this parameter (no gradient was
                # populated); torch.norm(None) would raise.
                continue
            norm = torch.norm(grad)
            if norm != 0 and not torch.isnan(norm):
                param.data.add_(self.epsilon * grad / norm)

    def restore(self, model):
        """Put back the parameter values saved by ``attack`` and clear the backup."""
        for name, param in model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup, f"{name} not found in backup during restore"
                param.data = self.backup[name]
        self.backup = {}

    def training_step(self, model, inputs, loss_fn=None):
        """Run one training step, plus an adversarial step when enabled.

        Args:
            model: Model being trained.
            inputs: Batch passed to the parent ``training_step``.
            loss_fn: Forwarded unchanged as the third positional argument of
                the parent ``training_step``.

        Returns:
            The parent's loss; with ``adv_training`` the sum of the clean and
            the adversarial loss (so the reported training loss covers both
            passes).
        """
        loss = super().training_step(model, inputs, loss_fn)
        if not self.adv_training:
            return loss

        # attack() reads param.grad, i.e. it relies on the call above having
        # left gradients on the embedding parameters.
        self.attack(model)
        try:
            adv_loss = super().training_step(model, inputs, loss_fn)
        finally:
            # Always undo the perturbation, even if the adversarial pass
            # raises, so the model never keeps perturbed embeddings.
            self.restore(model)
        return loss + adv_loss

In [79]:
def sort(train_dataset, unlabeled_dataset, logits, label_list, num_k):
    """Promote the most confident predictions of each class to training data.

    Each unlabeled example is assigned the class with the highest softmax
    probability. For every label in ``label_list`` the ``num_k`` examples
    predicted as that label with the highest probability are copied, given a
    ``'label'`` entry holding the predicted class, and appended to the
    training data. A per-class summary is printed.

    Args:
        train_dataset: List of training examples (dicts).
        unlabeled_dataset: Indexable collection of example dicts, aligned
            row-for-row with ``logits``.
        logits: Per-example class scores (anything ``torch.tensor`` accepts).
        label_list: Class ids to select for; every predicted class id must
            appear in it, otherwise a ``KeyError`` is raised.
        num_k: Maximum number of examples promoted per class.

    Returns:
        ``(updated_train, remaining_unlabeled)``: the training list extended
        with the promoted copies, and the unlabeled examples that were not
        promoted, in their original order.
    """
    probs = F.softmax(torch.tensor(logits), dim=-1)
    confidences, pseudo_labels = torch.max(probs, dim=-1)

    # Group (row index, confidence) pairs by predicted class.
    label2indices = {label: [] for label in label_list}
    for idx, (pred, conf) in enumerate(zip(pseudo_labels.tolist(), confidences.tolist())):
        label2indices[pred].append((idx, conf))

    selected_indices = []
    print('Pseudo-labeled instance count per class:')
    for label in label_list:
        candidates = label2indices[label]
        if not candidates:
            print(f"Class {label}: No confident instances")
            continue
        top_k = sorted(candidates, key=lambda x: x[1], reverse=True)[:num_k]
        selected_indices.extend(idx for idx, _ in top_k)
        top_confidences = [conf for _, conf in top_k]
        min_conf = min(top_confidences)
        max_conf = max(top_confidences)
        print(f"Class {label}: {len(top_k)} instances selected (out of {len(candidates)}), Confidence range: {min_conf:.4f}–{max_conf:.4f}")

    # Copy before labeling so the caller's unlabeled examples stay untouched.
    selected = []
    for i in selected_indices:
        ex = unlabeled_dataset[i].copy()
        ex['label'] = int(pseudo_labels[i])
        selected.append(ex)

    # Set membership keeps this filter linear in the number of unlabeled rows
    # (a list lookup here rescans all selected indices for every row).
    selected_lookup = set(selected_indices)
    remaining_unlabeled = [
        unlabeled_dataset[i]
        for i in range(len(unlabeled_dataset))
        if i not in selected_lookup
    ]
    updated_train = train_dataset + selected
    return updated_train, remaining_unlabeled


In [None]:
def predict(trainer, train_dataset, unlabeled_dataset, label_list, num_k):
    """Score the unlabeled examples with ``trainer`` and pseudo-label the best ones.

    Runs ``trainer.predict`` on ``unlabeled_dataset`` and hands the resulting
    logits to ``sort`` together with the remaining arguments.

    Returns:
        ``(updated_train, remaining_unlabeled)`` as produced by ``sort``.
    """
    logits = trainer.predict(unlabeled_dataset).predictions
    updated_train, remaining_unlabeled = sort(
        train_dataset, unlabeled_dataset, logits, label_list, num_k
    )
    return updated_train, remaining_unlabeled

In [81]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
def _best_checkpoint_epoch(trainer, metric_key='eval_f1'):
    """Return the epoch whose evaluation produced the trainer's best metric.

    ``trainer.state.epoch`` is the epoch at which training stopped, which is
    not necessarily the epoch of the checkpoint reloaded by
    ``load_best_model_at_end``. Falls back to ``trainer.state.epoch`` when no
    matching evaluation record is found.
    """
    best_metric = trainer.state.best_metric
    if best_metric is not None:
        for record in trainer.state.log_history:
            if record.get(metric_key) == best_metric and 'epoch' in record:
                return record['epoch']
    return trainer.state.epoch


def _evaluate_split(trainer, dataset, split_name):
    """Predict on a labeled split and return ``compute_metrics`` for it.

    Raises:
        ValueError: If the prediction output carries no labels. Scoring the
            predictions against themselves would report perfect metrics.
    """
    output = trainer.predict(dataset)
    if output.label_ids is None:
        raise ValueError(f"No labels returned for the {split_name} set; cannot compute metrics.")
    return compute_metrics((output.predictions, output.label_ids))


def self_training_loop():
    """Run ``num_self_train`` rounds of self-training and report the best one.

    Reads the notebook-level configuration (``model_name``, ``adv_training``,
    ``num_labels``, hyperparameters) and data frames. Each round:

    1. loads a fresh pretrained ``model_name`` classifier,
    2. trains it on the current training list, evaluating on the Filipino
       test frame every epoch and reloading the best-F1 checkpoint at the end,
    3. (all rounds but the last) moves the top ``num_top_k`` most confident
       Filipino predictions per class from the unlabeled pool into the
       training list,
    4. prints metrics on the Filipino and English test frames.

    Checkpoints go to ``./results/sl_adv/<model>/iter_<n>`` when
    ``adv_training`` is on, otherwise ``./results/sl/<model>/iter_<n>``.
    Nothing is returned; results are printed.

    NOTE(review): the Filipino test frame doubles as the checkpoint-selection
    set, so the scores printed for it are optimistic; consider a separate
    validation split.
    """
    set_seed(42)

    train_dataset = Dataset.from_pandas(en_train_df).map(preprocess, batched=True).to_list()
    unlabeled_dataset = Dataset.from_pandas(fil_train_df.drop(columns=['label'], errors='ignore')).map(preprocess, batched=True).to_list()
    fil_val_dataset = Dataset.from_pandas(fil_test_df).map(preprocess, batched=True)
    en_val_dataset = Dataset.from_pandas(en_test_df).map(preprocess, batched=True)

    label_list = list(range(num_labels))

    # Neither value changes between rounds, so compute them once.
    run_kind = 'sl_adv' if adv_training else 'sl'
    base_output_dir = f"./results/{run_kind}/{model_name.replace('/', '_')}"
    # Embedding name is only needed (and only validated) for adversarial runs.
    embedding_name = get_embedding_name(model_name) if adv_training else None

    best_f1 = -1.0
    best_iteration = -1
    best_epoch = -1

    for i in range(num_self_train):
        print(f"\nSELF-LEARNING ITERATION {i + 1}/{num_self_train}")

        # Every round restarts from the pretrained weights.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        iter_output_dir = f"{base_output_dir}/iter_{i+1}"

        training_args = TrainingArguments(
            output_dir=iter_output_dir,
            eval_strategy='epoch',
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=learning_rate,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            save_total_limit=1,
            report_to='none',
            seed=42,
        )

        # Prepare training dataset for this iteration
        train_ds = Dataset.from_list(train_dataset).map(preprocess, batched=True)

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=fil_val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            adv_training=adv_training,
            emb_name=embedding_name
        )

        trainer.train()

        # Run pseudo-labeling only if more training rounds are left
        if i < num_self_train - 1:
            train_dataset, unlabeled_dataset = predict(
                trainer, train_dataset, unlabeled_dataset, label_list, num_top_k
            )
            print(f"\nTraining set size after this round: {len(train_dataset)}")
            print(f"Remaining unlabeled examples after this round: {len(unlabeled_dataset)}")

        # Evaluate current model on Filipino validation set
        fil_metrics = _evaluate_split(trainer, fil_val_dataset, 'Filipino validation')
        print(f"Metrics on Filipino Validation Set: {fil_metrics}")

        # Evaluate current model on English validation set
        en_metrics = _evaluate_split(trainer, en_val_dataset, 'English validation')
        print(f"Metrics on English validation Set: {en_metrics}")

        if fil_metrics['f1'] > best_f1:
            best_f1 = fil_metrics['f1']
            best_iteration = i + 1
            # Report the epoch of the reloaded best checkpoint, not the epoch
            # at which training stopped ('eval_f1' mirrors metric_for_best_model).
            best_epoch = _best_checkpoint_epoch(trainer)
            print(f"New best model found on iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")

    print(f"\nBest overall model was from iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")


## **Self-learning**

### mBERT

In [23]:
# Expects model_name = 'bert-base-multilingual-cased' and adv_training = False
# in the config cell; self_training_loop() reads both as globals.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 20095.54 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21045.70 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 14318.86 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 24776.26 examples/s]



SELF-LEARNING ITERATION 1/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 24881.33 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 791), Confidence range: 0.9181–0.9498
Class 1: 40 instances selected (out of 1616), Confidence range: 0.8163–0.8618
Class 2: 40 instances selected (out of 371), Confidence range: 0.9357–0.9631
Class 3: 40 instances selected (out of 79), Confidence range: 0.5519–0.9036
Class 4: 40 instances selected (out of 143), Confidence range: 0.7109–0.9436
Class 5: 40 instances selected (out of 769), Confidence range: 0.9213–0.9352
Class 6: 40 instances selected (out of 2557), Confidence range: 0.9771–0.9798
Class 7: 40 instances selected (out of 584), Confidence range: 0.9017–0.9551
Class 8: 40 instances selected (out of 2159), Confidence range: 0.9700–0.9721
Class 9: 40 instances selected (out of 3066), Confidence range: 0.8990–0.9256
Class 10: 40 instances selected (out of 374), Confidence range: 0.6817–0.8714
Class 11: 40 instances selected (out of 223), Confidence range: 0.7922–0.9358
Class 12: 40 instances selecte

Metrics on Filipino Validation Set: {'accuracy': 0.4410112359550562, 'f1': 0.4587407732423319, 'rmse': np.float64(4.232365927978654)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689478321905268, 'rmse': np.float64(3.421841609426129)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4587

SELF-LEARNING ITERATION 2/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:01<00:00, 18227.43 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5452,2.062803,0.368579,0.390707,4.268924
2,0.9664,2.140092,0.393459,0.424088,4.195895
3,0.7347,2.221549,0.388644,0.419401,4.130083


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1052), Confidence range: 0.9364–0.9512
Class 1: 40 instances selected (out of 1414), Confidence range: 0.8041–0.8723
Class 2: 40 instances selected (out of 294), Confidence range: 0.9271–0.9550
Class 3: 40 instances selected (out of 234), Confidence range: 0.6925–0.9077
Class 4: 40 instances selected (out of 628), Confidence range: 0.8531–0.9388
Class 5: 40 instances selected (out of 1218), Confidence range: 0.8867–0.9269
Class 6: 40 instances selected (out of 2029), Confidence range: 0.9681–0.9728
Class 7: 40 instances selected (out of 647), Confidence range: 0.9143–0.9457
Class 8: 40 instances selected (out of 2067), Confidence range: 0.9657–0.9686
Class 9: 40 instances selected (out of 2677), Confidence range: 0.8742–0.9075
Class 10: 40 instances selected (out of 1396), Confidence range: 0.8662–0.9031
Class 11: 40 instances selected (out of 333), Confidence range: 0.8563–0.9255
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.39345906902086675, 'f1': 0.42408771241429094, 'rmse': np.float64(4.1958953419629355)}


Metrics on English validation Set: {'accuracy': 0.6866, 'f1': 0.6855292050832662, 'rmse': np.float64(3.4478108996869303)}

SELF-LEARNING ITERATION 3/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:01<00:00, 18308.04 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5347,2.241271,0.342295,0.372288,4.415358
2,0.9338,2.334151,0.35634,0.386868,4.197736
3,0.7,2.359192,0.364767,0.398668,4.170065


Metrics on Filipino Validation Set: {'accuracy': 0.3647672552166934, 'f1': 0.3986676154162394, 'rmse': np.float64(4.170065494441229)}


Metrics on English validation Set: {'accuracy': 0.696, 'f1': 0.694937290834445, 'rmse': np.float64(3.3450859480736814)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4587


### XLM-RoBERTa

In [63]:
# Expects model_name = 'xlm-roberta-base' and adv_training = False
# in the config cell; self_training_loop() reads both as globals.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 26102.76 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 24817.46 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 28865.52 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 30312.45 examples/s]



SELF-LEARNING ITERATION 1/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 26001.68 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6715,1.798217,0.442817,0.445454,4.006816
2,1.0373,1.813685,0.461878,0.471468,3.837228
3,0.8282,1.788131,0.472311,0.48153,3.831629


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1241), Confidence range: 0.9714–0.9769
Class 1: 40 instances selected (out of 1044), Confidence range: 0.8541–0.8987
Class 2: 40 instances selected (out of 342), Confidence range: 0.9642–0.9768
Class 3: 40 instances selected (out of 109), Confidence range: 0.5864–0.9213
Class 4: 40 instances selected (out of 156), Confidence range: 0.6945–0.9417
Class 5: 40 instances selected (out of 1406), Confidence range: 0.9328–0.9515
Class 6: 40 instances selected (out of 3998), Confidence range: 0.9737–0.9788
Class 7: 40 instances selected (out of 402), Confidence range: 0.9127–0.9682
Class 8: 40 instances selected (out of 2534), Confidence range: 0.9802–0.9820
Class 9: 40 instances selected (out of 2799), Confidence range: 0.9049–0.9405
Class 10: 40 instances selected (out of 990), Confidence range: 0.8950–0.9525
Class 11: 40 instances selected (out of 407), Confidence range: 0.9395–0.9684
Class 12: 40 instances sele

Metrics on Filipino Validation Set: {'accuracy': 0.47231139646869985, 'f1': 0.4815300293121325, 'rmse': np.float64(3.831628950894838)}


Metrics on English validation Set: {'accuracy': 0.704, 'f1': 0.7018077952406384, 'rmse': np.float64(3.2116662342155045)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4815

SELF-LEARNING ITERATION 2/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:00<00:00, 31214.00 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6275,1.884882,0.410915,0.41079,3.933435
2,0.993,1.835614,0.454655,0.472729,3.895376
3,0.7918,1.861039,0.451846,0.465383,3.841644


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1013), Confidence range: 0.9654–0.9778
Class 1: 40 instances selected (out of 1136), Confidence range: 0.8611–0.9114
Class 2: 40 instances selected (out of 325), Confidence range: 0.9546–0.9721
Class 3: 40 instances selected (out of 186), Confidence range: 0.6024–0.8615
Class 4: 40 instances selected (out of 351), Confidence range: 0.7387–0.9404
Class 5: 40 instances selected (out of 1265), Confidence range: 0.9014–0.9267
Class 6: 40 instances selected (out of 3453), Confidence range: 0.9648–0.9745
Class 7: 40 instances selected (out of 521), Confidence range: 0.8825–0.9606
Class 8: 40 instances selected (out of 2024), Confidence range: 0.9783–0.9803
Class 9: 40 instances selected (out of 2692), Confidence range: 0.8884–0.9184
Class 10: 40 instances selected (out of 1300), Confidence range: 0.8318–0.8978
Class 11: 40 instances selected (out of 420), Confidence range: 0.8935–0.9520
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.4546548956661316, 'f1': 0.47272871500234676, 'rmse': np.float64(3.8953763183184513)}


Metrics on English validation Set: {'accuracy': 0.7024, 'f1': 0.7004673955561037, 'rmse': np.float64(3.2506922339710966)}

SELF-LEARNING ITERATION 3/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:00<00:00, 30812.17 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6159,1.996241,0.415931,0.421244,3.885268
2,0.9764,1.875511,0.450642,0.467962,3.780998
3,0.7693,1.958302,0.449037,0.466052,3.837803


Metrics on Filipino Validation Set: {'accuracy': 0.45064205457463885, 'f1': 0.46796242211360306, 'rmse': np.float64(3.7809981526704814)}


Metrics on English validation Set: {'accuracy': 0.6986, 'f1': 0.6978094985033408, 'rmse': np.float64(3.2878260294608044)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4815


## **Self-learning with Adversarial Training**

### mBERT

In [43]:
# Expects model_name = 'bert-base-multilingual-cased' and adv_training = True
# in the config cell; self_training_loop() reads both as globals.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 24236.53 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21063.76 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 24431.12 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 15261.86 examples/s]



SELF-LEARNING ITERATION 1/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 24970.20 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.6292,1.714558,0.429976,0.436066,4.363821
2,2.5278,1.786078,0.423957,0.44184,4.372572
3,2.0789,1.715989,0.448836,0.464945,4.264128


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 664), Confidence range: 0.8402–0.9122
Class 1: 40 instances selected (out of 1102), Confidence range: 0.6725–0.7584
Class 2: 40 instances selected (out of 333), Confidence range: 0.9013–0.9498
Class 3: 40 instances selected (out of 75), Confidence range: 0.4044–0.8164
Class 4: 40 instances selected (out of 54), Confidence range: 0.2863–0.8917
Class 5: 40 instances selected (out of 675), Confidence range: 0.8523–0.9033
Class 6: 40 instances selected (out of 3028), Confidence range: 0.9538–0.9648
Class 7: 40 instances selected (out of 392), Confidence range: 0.7884–0.9154
Class 8: 40 instances selected (out of 2321), Confidence range: 0.9580–0.9628
Class 9: 40 instances selected (out of 3020), Confidence range: 0.8513–0.8869
Class 10: 40 instances selected (out of 246), Confidence range: 0.4641–0.7304
Class 11: 40 instances selected (out of 214), Confidence range: 0.6770–0.9053
Class 12: 40 instances selected

Metrics on Filipino Validation Set: {'accuracy': 0.4488362760834671, 'f1': 0.4649451306977881, 'rmse': np.float64(4.264127684734299)}


Metrics on English validation Set: {'accuracy': 0.7052, 'f1': 0.703582213712078, 'rmse': np.float64(3.3456538972224847)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4649

SELF-LEARNING ITERATION 2/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:01<00:00, 17857.79 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.5339,1.982036,0.366974,0.387634,4.302982
2,2.4666,1.958667,0.392055,0.418403,4.236015
3,2.0271,2.000459,0.394061,0.419446,4.207666


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 916), Confidence range: 0.9128–0.9425
Class 1: 40 instances selected (out of 1446), Confidence range: 0.7723–0.8311
Class 2: 40 instances selected (out of 457), Confidence range: 0.9376–0.9543
Class 3: 40 instances selected (out of 470), Confidence range: 0.7122–0.8885
Class 4: 40 instances selected (out of 306), Confidence range: 0.7470–0.9261
Class 5: 40 instances selected (out of 847), Confidence range: 0.8566–0.9136
Class 6: 40 instances selected (out of 2332), Confidence range: 0.9505–0.9605
Class 7: 40 instances selected (out of 738), Confidence range: 0.8780–0.9295
Class 8: 40 instances selected (out of 2456), Confidence range: 0.9574–0.9624
Class 9: 40 instances selected (out of 2683), Confidence range: 0.8891–0.9080
Class 10: 40 instances selected (out of 1451), Confidence range: 0.8913–0.9183
Class 11: 40 instances selected (out of 534), Confidence range: 0.8763–0.9308
Class 12: 40 instances selec

Metrics on Filipino Validation Set: {'accuracy': 0.3940609951845907, 'f1': 0.4194462530252862, 'rmse': np.float64(4.207666129056767)}


Metrics on English validation Set: {'accuracy': 0.698, 'f1': 0.6964769898433307, 'rmse': np.float64(3.4413078909042705)}

SELF-LEARNING ITERATION 3/3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:01<00:00, 20502.40 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.4797,2.099655,0.359952,0.386892,4.273692
2,2.3667,2.182828,0.366172,0.395093,4.186393
3,1.9463,2.177252,0.376605,0.408916,4.190943


Metrics on Filipino Validation Set: {'accuracy': 0.3766051364365971, 'f1': 0.40891643099319636, 'rmse': np.float64(4.1909431892366795)}


Metrics on English validation Set: {'accuracy': 0.7102, 'f1': 0.7084034865391744, 'rmse': np.float64(3.3674916480965473)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4649


### XLM-RoBERTa

In [83]:
# Expects model_name = 'xlm-roberta-base' and adv_training = True
# in the config cell; self_training_loop() reads both as globals.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 25740.09 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 24342.71 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 28604.56 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 31649.25 examples/s]



SELF-LEARNING ITERATION 1/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 26430.21 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.3244,1.71969,0.455257,0.462936,4.08313
2,2.1134,1.715636,0.473315,0.484257,3.845402
3,1.6588,1.725254,0.482544,0.491734,3.807122


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1089), Confidence range: 0.9747–0.9797
Class 1: 40 instances selected (out of 930), Confidence range: 0.8710–0.9229
Class 2: 40 instances selected (out of 351), Confidence range: 0.9610–0.9783
Class 3: 40 instances selected (out of 112), Confidence range: 0.5806–0.9506
Class 4: 40 instances selected (out of 143), Confidence range: 0.6308–0.9549
Class 5: 40 instances selected (out of 1333), Confidence range: 0.9274–0.9561
Class 6: 40 instances selected (out of 4106), Confidence range: 0.9784–0.9837
Class 7: 40 instances selected (out of 454), Confidence range: 0.9232–0.9671
Class 8: 40 instances selected (out of 2437), Confidence range: 0.9826–0.9853
Class 9: 40 instances selected (out of 2703), Confidence range: 0.9067–0.9407
Class 10: 40 instances selected (out of 944), Confidence range: 0.9012–0.9610
Class 11: 40 instances selected (out of 400), Confidence range: 0.9348–0.9718
Class 12: 40 instances selec

Metrics on Filipino Validation Set: {'accuracy': 0.4825441412520064, 'f1': 0.4917339887906747, 'rmse': np.float64(3.807122454087514)}


Metrics on English validation Set: {'accuracy': 0.7252, 'f1': 0.7240784101307837, 'rmse': np.float64(3.0986771371022184)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4917

SELF-LEARNING ITERATION 2/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:01<00:00, 20375.70 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.2712,1.747432,0.445225,0.448992,3.865905
2,2.0523,1.736007,0.470104,0.486185,3.861232
3,1.6281,1.760985,0.471709,0.490147,3.846993


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1175), Confidence range: 0.9710–0.9775
Class 1: 40 instances selected (out of 913), Confidence range: 0.8864–0.9248
Class 2: 40 instances selected (out of 377), Confidence range: 0.9590–0.9767
Class 3: 40 instances selected (out of 227), Confidence range: 0.6578–0.9371
Class 4: 40 instances selected (out of 319), Confidence range: 0.7417–0.9271
Class 5: 40 instances selected (out of 1296), Confidence range: 0.9233–0.9490
Class 6: 40 instances selected (out of 3390), Confidence range: 0.9763–0.9820
Class 7: 40 instances selected (out of 523), Confidence range: 0.8876–0.9628
Class 8: 40 instances selected (out of 2361), Confidence range: 0.9842–0.9872
Class 9: 40 instances selected (out of 2792), Confidence range: 0.8987–0.9301
Class 10: 40 instances selected (out of 1263), Confidence range: 0.9110–0.9524
Class 11: 40 instances selected (out of 497), Confidence range: 0.9291–0.9709
Class 12: 40 instances sele

Metrics on Filipino Validation Set: {'accuracy': 0.4717094703049759, 'f1': 0.49014685382412704, 'rmse': np.float64(3.8469933643594136)}


Metrics on English validation Set: {'accuracy': 0.7182, 'f1': 0.7170546294311572, 'rmse': np.float64(3.12717124571073)}

SELF-LEARNING ITERATION 3/3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:00<00:00, 29554.20 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,3.2123,1.855223,0.434189,0.452225,3.824029
2,1.9788,1.842061,0.450241,0.469214,3.79067
3,1.5589,1.873778,0.455859,0.477448,3.872647


Metrics on Filipino Validation Set: {'accuracy': 0.45585874799357945, 'f1': 0.4774484475510348, 'rmse': np.float64(3.8726465954551714)}


Metrics on English validation Set: {'accuracy': 0.7226, 'f1': 0.7211250310867204, 'rmse': np.float64(3.0958036113422955)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4917


## **Investigating the Results**

In [None]:
# Checkpoint to inspect. `iter_1` matches the best iteration reported by the
# self-learning log; checkpoint-1875 is presumably that run's final (epoch 3)
# step -- TODO confirm against the saved checkpoints.
model_path = './results/sl_adv/xlm-roberta-base/iter_1/checkpoint-1875'

# Load the classifier and its tokenizer from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [94]:
# Note: despite the "val" name, this dataset is built from the Filipino *test*
# split (`fil_test_df`). It is wrapped as a Dataset and then passed through
# `preprocess` (defined in an earlier cell) in batched mode.
fil_test_hf = Dataset.from_pandas(fil_test_df)
fil_val_dataset = fil_test_hf.map(preprocess, batched=True)

Map: 100%|██████████| 4984/4984 [00:00<00:00, 25649.99 examples/s]


In [None]:
# Inference-only Trainer: no TrainingArguments or datasets are supplied, so the
# library defaults apply and the object is used solely to call `predict`.
# `processing_class` replaces the deprecated `tokenizer` keyword, which raised
# a FutureWarning on this line and is slated for removal in transformers v5.
trainer = Trainer(model=model, processing_class=tokenizer)

# Run the loaded checkpoint over the Filipino evaluation set.
predictions = trainer.predict(fil_val_dataset)
logits = predictions.predictions      # raw model outputs; treated as per-class logits below
true_labels = predictions.label_ids   # reference labels carried by the dataset

  trainer = Trainer(model=model, tokenizer=tokenizer)


In [97]:
# Predicted class for each example = position of the largest logit in its row.
predicted_labels = np.argmax(logits, axis=1)

# Per-class precision / recall / F1 plus the averaged rows, printed with four
# decimal places.
report = classification_report(
    y_true=true_labels,
    y_pred=predicted_labels,
    digits=4,
)
print(report)

              precision    recall  f1-score   support

           0     0.4583    0.6790    0.5473       162
           1     0.4957    0.3239    0.3918       352
           2     0.3947    0.4615    0.4255        65
           3     0.1724    0.1111    0.1351        45
           4     0.2632    0.1370    0.1802        73
           5     0.3578    0.4500    0.3986       260
           6     0.7113    0.7120    0.7116      1104
           7     0.3217    0.3007    0.3108       153
           8     0.7734    0.4839    0.5953      1023
           9     0.1672    0.4685    0.2464       222
          10     0.1787    0.2979    0.2234       141
          11     0.3304    0.4634    0.3858        82
          12     0.3045    0.6512    0.4150       281
          13     0.4500    0.8182    0.5806        99
          14     0.6055    0.2646    0.3683       922

    accuracy                         0.4825      4984
   macro avg     0.3990    0.4415    0.3944      4984
weighted avg     0.5614   

In [None]:
# Softmax turns the logits into class probabilities; the largest probability in
# each row is used as the model's confidence in its prediction.
probs = torch.tensor(logits).softmax(dim=-1)
confidence_scores = probs.max(dim=1).values.numpy()

# Per-example results table: drop the columns that are not needed for
# inspection, then append the prediction, a correctness flag, and the confidence.
eval_df = (
    fil_val_dataset.to_pandas()
    .drop(columns=['code_frames', 'input_ids', 'attention_mask'])
    .assign(
        predicted_label=predicted_labels,
        match=lambda frame: frame['label'] == frame['predicted_label'],
        confidence=confidence_scores,
    )
)

eval_df

Unnamed: 0,text,label,predicted_label,match,confidence
0,Isang umano’y tinaguriang ‘shabu queen’ at lid...,6,6,True,0.981036
1,Anthrax infection kumalat sa Cagayan,8,8,True,0.967823
2,TESDA: Mga tech-voc graduate swak sa trabaho,9,1,False,0.481570
3,Nagkamit ng unang pwesto ang isang Filipina st...,14,13,False,0.759778
4,NEDA inaprub tapyas taripa sa e-vehicle,5,6,False,0.617909
...,...,...,...,...,...
4979,"Typhoon Betty, patuloy na humihina sa karagata...",8,9,False,0.657265
4980,'Sarap maging tatay!' Post ng netizen tungkol ...,10,11,False,0.927840
4981,"TESDA, maglulunsad ng training programs para s...",1,1,True,0.667123
4982,"Anne Curtis, nagdiwang ng kaarawan sa ‘It’s Sh...",14,10,False,0.615033


In [None]:
# Persist the per-example evaluation table for later inspection. With pandas'
# default settings the DataFrame index is written as the first CSV column.
results_csv_path = 'datasets/results/xlm-r_adv1_results.csv'
eval_df.to_csv(results_csv_path)
