## **Importing Resources**

In [61]:
!pip -q install transformers accelerate datasets scikit-learn torch pandas numpy


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import random
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.metrics import classification_report

## **Loading Datasets**

In [None]:
# Relative paths to the CSV splits: English train/test and Filipino train/test.
en_train_file = 'datasets/final/mpfc_train.csv'
en_test_file = 'datasets/final/mpfc_test.csv'
fil_train_file = 'datasets/final/fil_train.csv'
fil_test_file = 'datasets/final/fil_test.csv'

In [None]:
# Load each split into its own DataFrame; these frames are read as globals
# by self_training_loop() further down.
en_train_df = pd.read_csv(en_train_file)
en_test_df = pd.read_csv(en_test_file)
fil_train_df = pd.read_csv(fil_train_file)
fil_test_df = pd.read_csv(fil_test_file)

In [None]:
# Force integer class ids on the three labelled splits.
# fil_train_df is not cast: its `label` column is empty in the preview below
# (it serves as the unlabeled pool) and is dropped before tokenization in
# self_training_loop().
en_train_df['label'] = en_train_df['label'].astype(int)
fil_test_df['label'] = fil_test_df['label'].astype(int)
en_test_df['label'] = en_test_df['label'].astype(int)

In [66]:
en_train_df

Unnamed: 0,text,code_frames,label
0,Senator Sherwin Gatchalian filed a civil lawsu...,7,6
1,AVOID COLLATERAL DAMAGE FROM NRA'S CAMPAIGN,15,14
2,MANILA – Human immunodeficiency virus (HIV) in...,9,8
3,"MANILA, Philippines – President Ferdinand Marc...",2,1
4,Japanese Embassy in PH thanks DOJ over deporta...,14,13
...,...,...,...
19995,MANILA – President Ferdinand R. Marcos Jr. on ...,2,1
19996,"Do you have a question on the news - local, na...",12,11
19997,Davao Oriental 2nd district Rep. Cheeno Almari...,13,12
19998,The Philippine government is eyeing to deport ...,7,6


In [67]:
en_test_df

Unnamed: 0,text,code_frames,label
0,Physical distancing in classrooms may be eased...,10,9
1,Bishop took on sensitive social issues\r\n,3,2
2,MANILA – President Ferdinand R. Marcos Jr. has...,14,13
3,Florida voters strongly support an increase in...,12,11
4,The Supreme Court had approved new state death...,5,4
...,...,...,...
4995,"Telemachus 'Tel' Orfanos, 27, survived mass sh...",10,9
4996,"""Open Carry Picnic"" -- a mix of a typical outd...",12,11
4997,ASEAN first: Philippine presidents and their s...,13,12
4998,The Bureau of Immigration (BI) has stopped ano...,7,6


In [68]:
fil_train_df

Unnamed: 0,text,label
0,"Sa 110, missing 33 pa",
1,Masarap talagang chumibog ng malalamig na pagk...,
2,Todas sa sama ng panahon 43 na — NDRRMC,
3,Mga gov’t worker may tig-20K bonus pa,
4,3 nirapido ng ‘Bonnet Gang’ sa kotse,
...,...,...
19993,"Kabataan Partylist, nakiisa sa kilos-protesta ...",
19994,"DOH, nagbabala sa publiko vs karaniwang sakit ...",
19995,Tuloy ang transport strike sa Marso 6 hanggang...,
19996,"Barko sa Palawan, nasunog, lumubog; 2 tripulan...",


In [69]:
fil_test_df

Unnamed: 0,text,code_frames,label
0,Isang umano’y tinaguriang ‘shabu queen’ at lid...,7,6
1,Anthrax infection kumalat sa Cagayan,9,8
2,TESDA: Mga tech-voc graduate swak sa trabaho,10,9
3,Nagkamit ng unang pwesto ang isang Filipina st...,15,14
4,NEDA inaprub tapyas taripa sa e-vehicle,6,5
...,...,...,...
4979,"Typhoon Betty, patuloy na humihina sa karagata...",9,8
4980,'Sarap maging tatay!' Post ng netizen tungkol ...,11,10
4981,"TESDA, maglulunsad ng training programs para s...",2,1
4982,"Anne Curtis, nagdiwang ng kaarawan sa ‘It’s Sh...",15,14


## **Preparing Training Setup**

In [None]:
# Choose model: 'bert-base-multilingual-cased' or 'xlm-roberta-base'
# NOTE: the tokenizer cell below is built from this value, so re-run it (and
# any cell that depends on it) after changing the model.
model_name = 'bert-base-multilingual-cased'

# Training hyperparameters
num_labels = 15          # size of the classification head; label ids are range(num_labels)
max_length = 128         # every input is truncated/padded to this many tokens
learning_rate = 2e-5
train_batch_size = 32    # per-device batch size for training
eval_batch_size = 32     # per-device batch size for evaluation/prediction
num_epochs = 3           # epochs per self-training round
weight_decay = 0.01

# For self-training
num_top_k = 40           # max pseudo-labelled examples promoted per class and per round
num_self_train = 5       # number of self-training rounds (a fresh model is trained in each)

# For adversarial training
# When True, CustomTrainer perturbs the word embeddings on every training step.
adv_training = False

In [None]:
# Tokenizer for the checkpoint named by `model_name` at the time this cell runs.
# `preprocess` reads this global, so re-run this cell whenever `model_name` changes;
# otherwise inputs keep being tokenized with the previous model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [72]:
def preprocess(example):
    """Tokenize the ``text`` field of an example (or a batch of examples).

    Relies on the notebook-level ``tokenizer`` and ``max_length``. Sequences
    are truncated and padded to ``max_length`` so all rows share one length.

    Args:
        example: Mapping with a ``'text'`` entry (a string, or a list of
            strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    texts = example['text']
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(texts, **tokenizer_options)

In [None]:
def compute_metrics(eval_pred):
    """Score model outputs against reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the arg-max over the last axis of ``logits``.

    Returns:
        dict with ``'accuracy'``, weighted ``'f1'`` and ``'rmse'`` (root mean
        squared error computed directly on the integer class ids).
    """
    raw_scores, gold = eval_pred
    gold = np.array(gold)
    predicted = torch.tensor(raw_scores).argmax(dim=-1).numpy()

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1'] = f1_score(gold, predicted, average='weighted')
    metrics['rmse'] = np.sqrt(mean_squared_error(gold, predicted))
    return metrics

In [74]:
def get_embedding_name(model_name):
    """Map a checkpoint name to the parameter-name fragment of its word embeddings.

    Args:
        model_name: Model identifier, e.g. ``'xlm-roberta-base'``.

    Returns:
        ``'<backbone>.embeddings.word_embeddings'`` for the detected backbone.

    Raises:
        ValueError: If the name contains neither ``'roberta'`` nor ``'bert'``.
    """
    # 'roberta' must be tried first because the string 'roberta' itself
    # contains 'bert'.
    for backbone in ('roberta', 'bert'):
        if backbone in model_name:
            return backbone + '.embeddings.word_embeddings'
    raise ValueError(f"Unsupported model architecture in: {model_name}")

In [None]:
class CustomTrainer(Trainer):
    """``Trainer`` with optional adversarial training on the word embeddings.

    When ``adv_training`` is enabled, every training step runs twice: once on
    the clean weights and once after the embedding weights have been shifted
    by ``epsilon`` along their normalized gradient (an FGM-style perturbation).
    The embeddings are restored afterwards, so the optimizer sees the
    accumulated gradients of both passes on the original weights.

    Args:
        adv_training: Enable the adversarial second pass.
        epsilon: Magnitude of the embedding perturbation.
        emb_name: Substring identifying the embedding parameter(s) by name;
            required when ``adv_training`` is True.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``adv_training`` is True and ``emb_name`` is None.
    """

    def __init__(self, *args, adv_training=False, epsilon=1.0, emb_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.adv_training = adv_training
        self.epsilon = epsilon
        self.emb_name = emb_name
        # name -> clean copy of the parameter data taken just before the perturbation
        self.backup = {}

        if self.adv_training and self.emb_name is None:
            raise ValueError('Embedding layer name (`emb_name`) must be provided when adversarial training is enabled.')

    def attack(self, model):
        """Back up the matching embedding parameters and perturb them in place."""
        for name, param in model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                # Always back up so that restore() finds every matching parameter.
                self.backup[name] = param.data.clone()
                if param.grad is None:
                    # No gradient was produced for this parameter: nothing to follow.
                    continue
                norm = torch.norm(param.grad)
                # Skip zero, NaN and infinite norms; dividing by them would
                # write NaN/zero garbage into the embeddings.
                if norm != 0 and torch.isfinite(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self, model):
        """Put back the embedding values saved by the last ``attack`` call."""
        for name, param in model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup, f"{name} not found in backup during restore"
                param.data = self.backup[name]
        self.backup = {}

    def training_step(self, model, inputs, loss_fn=None):
        """Run the regular step and, if enabled, an extra adversarial step.

        The third argument is forwarded positionally to ``Trainer.training_step``.
        NOTE(review): recent transformers releases pass ``num_items_in_batch``
        in this position — confirm against the installed version.

        Returns:
            The clean loss, or the sum of clean and adversarial losses when
            adversarial training is enabled (so the logged training loss is
            that sum).
        """
        loss = super().training_step(model, inputs, loss_fn)

        if self.adv_training:
            self.attack(model)
            try:
                adv_loss = super().training_step(model, inputs, loss_fn)
            finally:
                # Never leave the embeddings perturbed, even if the adversarial
                # pass raises (e.g. out-of-memory or an interrupted run).
                self.restore(model)
            loss += adv_loss

        return loss

In [76]:
def sort(train_dataset, unlabeled_dataset, logits, label_list, num_k):
    """Move the most confident pseudo-labelled examples into the training set.

    Each unlabeled example is assigned the arg-max class of its softmax
    distribution; for every class in ``label_list`` the ``num_k`` examples with
    the highest confidence are copied, tagged with that class as ``'label'``
    and appended to the training data.

    Args:
        train_dataset: List of example dicts (not modified).
        unlabeled_dataset: Sequence of example dicts, aligned row by row with
            ``logits`` (not modified).
        logits: Per-example class scores, one row per unlabeled example.
        label_list: Class ids eligible for pseudo-labelling. Examples whose
            predicted class is not listed stay in the unlabeled pool.
        num_k: Maximum number of examples promoted per class; values <= 0
            promote nothing.

    Returns:
        ``(updated_train, remaining_unlabeled)`` as two new lists.
    """
    if len(unlabeled_dataset) == 0:
        # Nothing to pseudo-label; torch.max would fail on an empty tensor.
        return train_dataset + [], []

    probs = F.softmax(torch.tensor(logits), dim=-1)
    confidences, pseudo_labels = torch.max(probs, dim=-1)
    predicted = pseudo_labels.tolist()
    scores = confidences.tolist()

    # Group (row index, confidence) pairs by predicted class.
    label2indices = {label: [] for label in label_list}
    for idx, (pred, conf) in enumerate(zip(predicted, scores)):
        bucket = label2indices.get(pred)
        if bucket is not None:
            bucket.append((idx, conf))

    per_class_limit = max(num_k, 0)
    selected_indices = []
    print('Pseudo-labeled instance count per class:')
    for label, candidates in label2indices.items():
        ranked = sorted(candidates, key=lambda x: x[1], reverse=True)
        top_k = ranked[:per_class_limit]
        if not top_k:
            print(f"Class {label}: No confident instances")
            continue
        selected_indices.extend(idx for idx, _ in top_k)
        top_confidences = [conf for _, conf in top_k]
        min_conf = min(top_confidences)
        max_conf = max(top_confidences)
        print(f"Class {label}: {len(top_k)} instances selected (out of {len(candidates)}), Confidence range: {min_conf:.4f}–{max_conf:.4f}")

    # Copy before tagging so the caller's unlabeled examples stay untouched.
    selected = []
    for i in selected_indices:
        ex = unlabeled_dataset[i].copy()
        ex['label'] = int(predicted[i])
        selected.append(ex)

    # Set membership keeps the pool rebuild linear in the pool size.
    chosen = set(selected_indices)
    remaining_unlabeled = [unlabeled_dataset[i] for i in range(len(unlabeled_dataset)) if i not in chosen]
    updated_train = train_dataset + selected
    return updated_train, remaining_unlabeled


In [None]:
def predict(trainer, train_dataset, unlabeled_dataset, label_list, num_k):
    """Pseudo-label the unlabeled pool with ``trainer`` and grow the training set.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes the class
            scores as ``.predictions``.
        train_dataset: Current training examples.
        unlabeled_dataset: Examples to score.
        label_list: Class ids eligible for pseudo-labelling.
        num_k: Per-class cap on promoted examples.

    Returns:
        ``(updated_train, remaining_unlabeled)`` exactly as produced by ``sort``.
    """
    pool_logits = trainer.predict(unlabeled_dataset).predictions
    grown_train, leftover_pool = sort(train_dataset, unlabeled_dataset, pool_logits, label_list, num_k)
    return grown_train, leftover_pool

In [78]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also sets cuDNN to deterministic mode and turns off its benchmark mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
def _best_checkpoint_epoch(trainer):
    """Return the epoch whose evaluation produced the trainer's best ``eval_f1``.

    Falls back to the trainer's final epoch when no logged evaluation matches
    ``trainer.state.best_metric``.
    """
    state = trainer.state
    for record in state.log_history:
        if 'eval_f1' in record and record['eval_f1'] == state.best_metric:
            return record.get('epoch', state.epoch)
    return state.epoch


def self_training_loop(model_checkpoint=None, use_adv_training=None):
    """Run iterative self-training from English labels onto Filipino text.

    Each round fine-tunes a fresh model on the current training set, reports
    metrics on the Filipino and English evaluation sets, and (except after the
    last round) promotes up to ``num_top_k`` confident pseudo-labelled Filipino
    examples per class into the training set.

    Args:
        model_checkpoint: Model id to fine-tune. Defaults to the notebook-level
            ``model_name``. Passing it explicitly avoids accidentally re-running
            a previous configuration left in the kernel.
        use_adv_training: Whether ``CustomTrainer`` should run its adversarial
            pass. Defaults to the notebook-level ``adv_training`` flag.

    Reads the notebook-level data frames (``en_train_df``, ``en_test_df``,
    ``fil_train_df``, ``fil_test_df``) and hyperparameters. Returns nothing;
    progress and results are printed.

    NOTE(review): the Filipino evaluation set built from ``fil_test_df`` is used
    both to pick the best checkpoint and to report the final metrics, so the
    reported Filipino scores are optimistically biased.
    """
    run_model_name = model_name if model_checkpoint is None else model_checkpoint
    run_adv_training = adv_training if use_adv_training is None else use_adv_training
    # Echo the effective configuration so each recorded output states what it ran.
    print(f"Model: {run_model_name} | adversarial training: {run_adv_training}")

    set_seed(42)

    # Load the tokenizer for the checkpoint actually being trained instead of
    # trusting a global tokenizer that may belong to another model.
    run_tokenizer = AutoTokenizer.from_pretrained(run_model_name)

    def tokenize(batch):
        return run_tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

    train_dataset = Dataset.from_pandas(en_train_df).map(tokenize, batched=True).to_list()
    unlabeled_dataset = Dataset.from_pandas(fil_train_df.drop(columns=['label'], errors='ignore')).map(tokenize, batched=True).to_list()
    fil_val_dataset = Dataset.from_pandas(fil_test_df).map(tokenize, batched=True)
    en_val_dataset = Dataset.from_pandas(en_test_df).map(tokenize, batched=True)

    label_list = list(range(num_labels))

    # Loop-invariant settings.
    embedding_name = get_embedding_name(run_model_name) if run_adv_training else None
    base_output_dir = f"./results/sl_adv/{run_model_name.replace('/', '_')}" if run_adv_training else f"./results/sl/{run_model_name.replace('/', '_')}"

    best_f1 = -1.0
    best_iteration = -1
    best_epoch = -1

    for i in range(num_self_train):
        print(f"\nSELF-LEARNING ITERATION {i + 1}/{num_self_train}")

        # Every round starts again from the pretrained checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(run_model_name, num_labels=num_labels)

        iter_output_dir = f"{base_output_dir}/iter_{i+1}"

        training_args = TrainingArguments(
            output_dir=iter_output_dir,
            eval_strategy='epoch',
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=learning_rate,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            save_total_limit=1,
            report_to='none',
            seed=42,
        )

        # Every example in the list is already tokenized (gold rows above,
        # pseudo-labelled rows when the unlabeled pool was built), so no
        # second tokenization pass is needed here.
        train_ds = Dataset.from_list(train_dataset)

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=fil_val_dataset,
            tokenizer=run_tokenizer,
            compute_metrics=compute_metrics,
            adv_training=run_adv_training,
            emb_name=embedding_name
        )

        trainer.train()

        # Run pseudo-labeling only if more training rounds are left
        if i < num_self_train - 1:
            train_dataset, unlabeled_dataset = predict(
                trainer, train_dataset, unlabeled_dataset, label_list, num_top_k
            )
            print(f"\nTraining set size after this round: {len(train_dataset)}")
            print(f"Remaining unlabeled examples after this round: {len(unlabeled_dataset)}")

        # Evaluate current model on Filipino validation set
        fil_predictions = trainer.predict(fil_val_dataset)
        fil_logits = fil_predictions.predictions
        fil_labels = fil_predictions.label_ids if fil_predictions.label_ids is not None else np.argmax(fil_logits, axis=1)
        fil_metrics = compute_metrics((fil_logits, fil_labels))
        print(f"Metrics on Filipino Validation Set: {fil_metrics}")

        # Evaluate current model on English validation set
        en_predictions = trainer.predict(en_val_dataset)
        en_logits = en_predictions.predictions
        en_labels = en_predictions.label_ids if en_predictions.label_ids is not None else np.argmax(en_logits, axis=1)
        en_metrics = compute_metrics((en_logits, en_labels))
        print(f"Metrics on English validation Set: {en_metrics}")

        if fil_metrics['f1'] > best_f1:
            best_f1 = fil_metrics['f1']
            best_iteration = i + 1
            # load_best_model_at_end may have restored an earlier checkpoint, so
            # report the epoch of that checkpoint rather than the last epoch run.
            best_epoch = _best_checkpoint_epoch(trainer)
            print(f"New best model found on iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")

    print(f"\nBest overall model was from iteration {best_iteration}, epoch {best_epoch:.1f} with F1 = {best_f1:.4f}")


## **Self-learning**

### mBERT

In [20]:
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:01<00:00, 19140.82 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21322.44 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 23526.03 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 24750.15 examples/s]



SELF-LEARNING ITERATION 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 21392.44 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 791), Confidence range: 0.9181–0.9498
Class 1: 40 instances selected (out of 1616), Confidence range: 0.8163–0.8618
Class 2: 40 instances selected (out of 371), Confidence range: 0.9357–0.9631
Class 3: 40 instances selected (out of 79), Confidence range: 0.5519–0.9036
Class 4: 40 instances selected (out of 143), Confidence range: 0.7109–0.9436
Class 5: 40 instances selected (out of 769), Confidence range: 0.9213–0.9352
Class 6: 40 instances selected (out of 2557), Confidence range: 0.9771–0.9798
Class 7: 40 instances selected (out of 584), Confidence range: 0.9017–0.9551
Class 8: 40 instances selected (out of 2159), Confidence range: 0.9700–0.9721
Class 9: 40 instances selected (out of 3066), Confidence range: 0.8990–0.9256
Class 10: 40 instances selected (out of 374), Confidence range: 0.6817–0.8714
Class 11: 40 instances selected (out of 223), Confidence range: 0.7922–0.9358
Class 12: 40 instances selecte

Metrics on Filipino Validation Set: {'accuracy': 0.4410112359550562, 'f1': 0.4587407732423319, 'rmse': np.float64(4.232365927978654)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689478321905268, 'rmse': np.float64(3.421841609426129)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4587

SELF-LEARNING ITERATION 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:00<00:00, 21117.71 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5452,2.062803,0.368579,0.390707,4.268924
2,0.9664,2.140092,0.393459,0.424088,4.195895
3,0.7347,2.221549,0.388644,0.419401,4.130083


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1052), Confidence range: 0.9364–0.9512
Class 1: 40 instances selected (out of 1414), Confidence range: 0.8041–0.8723
Class 2: 40 instances selected (out of 294), Confidence range: 0.9271–0.9550
Class 3: 40 instances selected (out of 234), Confidence range: 0.6925–0.9077
Class 4: 40 instances selected (out of 628), Confidence range: 0.8531–0.9388
Class 5: 40 instances selected (out of 1218), Confidence range: 0.8867–0.9269
Class 6: 40 instances selected (out of 2029), Confidence range: 0.9681–0.9728
Class 7: 40 instances selected (out of 647), Confidence range: 0.9143–0.9457
Class 8: 40 instances selected (out of 2067), Confidence range: 0.9657–0.9686
Class 9: 40 instances selected (out of 2677), Confidence range: 0.8742–0.9075
Class 10: 40 instances selected (out of 1396), Confidence range: 0.8662–0.9031
Class 11: 40 instances selected (out of 333), Confidence range: 0.8563–0.9255
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.39345906902086675, 'f1': 0.42408771241429094, 'rmse': np.float64(4.1958953419629355)}


Metrics on English validation Set: {'accuracy': 0.6866, 'f1': 0.6855292050832662, 'rmse': np.float64(3.4478108996869303)}

SELF-LEARNING ITERATION 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:01<00:00, 21191.96 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5347,2.241271,0.342295,0.372288,4.415358
2,0.9338,2.334151,0.35634,0.386868,4.197736
3,0.7,2.359192,0.364767,0.398668,4.170065


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1050), Confidence range: 0.9643–0.9707
Class 1: 40 instances selected (out of 1338), Confidence range: 0.8855–0.9123
Class 2: 40 instances selected (out of 332), Confidence range: 0.9551–0.9701
Class 3: 40 instances selected (out of 515), Confidence range: 0.8719–0.9351
Class 4: 40 instances selected (out of 699), Confidence range: 0.9181–0.9591
Class 5: 40 instances selected (out of 1066), Confidence range: 0.9307–0.9498
Class 6: 40 instances selected (out of 1850), Confidence range: 0.9726–0.9760
Class 7: 40 instances selected (out of 972), Confidence range: 0.9559–0.9670
Class 8: 40 instances selected (out of 2014), Confidence range: 0.9726–0.9740
Class 9: 40 instances selected (out of 2614), Confidence range: 0.9247–0.9442
Class 10: 40 instances selected (out of 1664), Confidence range: 0.9339–0.9440
Class 11: 40 instances selected (out of 762), Confidence range: 0.9507–0.9652
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.3647672552166934, 'f1': 0.3986676154162394, 'rmse': np.float64(4.170065494441229)}


Metrics on English validation Set: {'accuracy': 0.696, 'f1': 0.694937290834445, 'rmse': np.float64(3.3450859480736814)}

SELF-LEARNING ITERATION 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21800/21800 [00:01<00:00, 21054.72 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.4979,2.441643,0.309591,0.333653,4.294744
2,0.9204,2.474061,0.338684,0.369457,4.231347
3,0.6931,2.581924,0.343499,0.374995,4.243823


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 884), Confidence range: 0.9518–0.9600
Class 1: 40 instances selected (out of 1463), Confidence range: 0.9094–0.9270
Class 2: 40 instances selected (out of 363), Confidence range: 0.9573–0.9704
Class 3: 40 instances selected (out of 920), Confidence range: 0.9297–0.9547
Class 4: 40 instances selected (out of 898), Confidence range: 0.9309–0.9644
Class 5: 40 instances selected (out of 869), Confidence range: 0.9312–0.9504
Class 6: 40 instances selected (out of 1572), Confidence range: 0.9709–0.9743
Class 7: 40 instances selected (out of 1028), Confidence range: 0.9594–0.9675
Class 8: 40 instances selected (out of 1824), Confidence range: 0.9739–0.9758
Class 9: 40 instances selected (out of 2500), Confidence range: 0.9455–0.9595
Class 10: 40 instances selected (out of 1773), Confidence range: 0.9510–0.9580
Class 11: 40 instances selected (out of 831), Confidence range: 0.9503–0.9614
Class 12: 40 instances sele

Metrics on Filipino Validation Set: {'accuracy': 0.3434991974317817, 'f1': 0.3749950998116085, 'rmse': np.float64(4.2438228170752765)}


Metrics on English validation Set: {'accuracy': 0.6894, 'f1': 0.6888677507371139, 'rmse': np.float64(3.3746999866654814)}

SELF-LEARNING ITERATION 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 22400/22400 [00:01<00:00, 21580.59 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.46,2.495588,0.312801,0.348815,4.40273
2,0.895,2.65561,0.333467,0.364908,4.288479
3,0.6717,2.767834,0.339286,0.371824,4.27496


Metrics on Filipino Validation Set: {'accuracy': 0.3392857142857143, 'f1': 0.3718241072539561, 'rmse': np.float64(4.274959754065107)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689702525400022, 'rmse': np.float64(3.3487012407797745)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4587


### XLM-RoBERTa

In [40]:
# NOTE(review): the output recorded below still reports BertForSequenceClassification
# loaded from bert-base-multilingual-cased, and its per-epoch metrics are identical to
# the mBERT run above, so `model_name` (and the tokenizer built from it) was apparently
# not switched to 'xlm-roberta-base' before this call. Update the configuration and
# tokenizer cells and re-run to obtain actual XLM-RoBERTa results.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 24736.33 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21720.05 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 15021.13 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 24648.95 examples/s]



SELF-LEARNING ITERATION 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 21904.20 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 791), Confidence range: 0.9181–0.9498
Class 1: 40 instances selected (out of 1616), Confidence range: 0.8163–0.8618
Class 2: 40 instances selected (out of 371), Confidence range: 0.9357–0.9631
Class 3: 40 instances selected (out of 79), Confidence range: 0.5519–0.9036
Class 4: 40 instances selected (out of 143), Confidence range: 0.7109–0.9436
Class 5: 40 instances selected (out of 769), Confidence range: 0.9213–0.9352
Class 6: 40 instances selected (out of 2557), Confidence range: 0.9771–0.9798
Class 7: 40 instances selected (out of 584), Confidence range: 0.9017–0.9551
Class 8: 40 instances selected (out of 2159), Confidence range: 0.9700–0.9721
Class 9: 40 instances selected (out of 3066), Confidence range: 0.8990–0.9256
Class 10: 40 instances selected (out of 374), Confidence range: 0.6817–0.8714
Class 11: 40 instances selected (out of 223), Confidence range: 0.7922–0.9358
Class 12: 40 instances selecte

Metrics on Filipino Validation Set: {'accuracy': 0.4410112359550562, 'f1': 0.4587407732423319, 'rmse': np.float64(4.232365927978654)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689478321905268, 'rmse': np.float64(3.421841609426129)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4587

SELF-LEARNING ITERATION 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:01<00:00, 13817.05 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5452,2.062803,0.368579,0.390707,4.268924
2,0.9664,2.140092,0.393459,0.424088,4.195895
3,0.7347,2.221549,0.388644,0.419401,4.130083


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1052), Confidence range: 0.9364–0.9512
Class 1: 40 instances selected (out of 1414), Confidence range: 0.8041–0.8723
Class 2: 40 instances selected (out of 294), Confidence range: 0.9271–0.9550
Class 3: 40 instances selected (out of 234), Confidence range: 0.6925–0.9077
Class 4: 40 instances selected (out of 628), Confidence range: 0.8531–0.9388
Class 5: 40 instances selected (out of 1218), Confidence range: 0.8867–0.9269
Class 6: 40 instances selected (out of 2029), Confidence range: 0.9681–0.9728
Class 7: 40 instances selected (out of 647), Confidence range: 0.9143–0.9457
Class 8: 40 instances selected (out of 2067), Confidence range: 0.9657–0.9686
Class 9: 40 instances selected (out of 2677), Confidence range: 0.8742–0.9075
Class 10: 40 instances selected (out of 1396), Confidence range: 0.8662–0.9031
Class 11: 40 instances selected (out of 333), Confidence range: 0.8563–0.9255
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.39345906902086675, 'f1': 0.42408771241429094, 'rmse': np.float64(4.1958953419629355)}


Metrics on English validation Set: {'accuracy': 0.6866, 'f1': 0.6855292050832662, 'rmse': np.float64(3.4478108996869303)}

SELF-LEARNING ITERATION 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:01<00:00, 14523.76 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5347,2.241271,0.342295,0.372288,4.415358
2,0.9338,2.334151,0.35634,0.386868,4.197736
3,0.7,2.359192,0.364767,0.398668,4.170065


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1050), Confidence range: 0.9643–0.9707
Class 1: 40 instances selected (out of 1338), Confidence range: 0.8855–0.9123
Class 2: 40 instances selected (out of 332), Confidence range: 0.9551–0.9701
Class 3: 40 instances selected (out of 515), Confidence range: 0.8719–0.9351
Class 4: 40 instances selected (out of 699), Confidence range: 0.9181–0.9591
Class 5: 40 instances selected (out of 1066), Confidence range: 0.9307–0.9498
Class 6: 40 instances selected (out of 1850), Confidence range: 0.9726–0.9760
Class 7: 40 instances selected (out of 972), Confidence range: 0.9559–0.9670
Class 8: 40 instances selected (out of 2014), Confidence range: 0.9726–0.9740
Class 9: 40 instances selected (out of 2614), Confidence range: 0.9247–0.9442
Class 10: 40 instances selected (out of 1664), Confidence range: 0.9339–0.9440
Class 11: 40 instances selected (out of 762), Confidence range: 0.9507–0.9652
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.3647672552166934, 'f1': 0.3986676154162394, 'rmse': np.float64(4.170065494441229)}


Metrics on English validation Set: {'accuracy': 0.696, 'f1': 0.694937290834445, 'rmse': np.float64(3.3450859480736814)}

SELF-LEARNING ITERATION 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21800/21800 [00:01<00:00, 14132.60 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.4979,2.441643,0.309591,0.333653,4.294744
2,0.9204,2.474061,0.338684,0.369457,4.231347
3,0.6931,2.581924,0.343499,0.374995,4.243823


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 884), Confidence range: 0.9518–0.9600
Class 1: 40 instances selected (out of 1463), Confidence range: 0.9094–0.9270
Class 2: 40 instances selected (out of 363), Confidence range: 0.9573–0.9704
Class 3: 40 instances selected (out of 920), Confidence range: 0.9297–0.9547
Class 4: 40 instances selected (out of 898), Confidence range: 0.9309–0.9644
Class 5: 40 instances selected (out of 869), Confidence range: 0.9312–0.9504
Class 6: 40 instances selected (out of 1572), Confidence range: 0.9709–0.9743
Class 7: 40 instances selected (out of 1028), Confidence range: 0.9594–0.9675
Class 8: 40 instances selected (out of 1824), Confidence range: 0.9739–0.9758
Class 9: 40 instances selected (out of 2500), Confidence range: 0.9455–0.9595
Class 10: 40 instances selected (out of 1773), Confidence range: 0.9510–0.9580
Class 11: 40 instances selected (out of 831), Confidence range: 0.9503–0.9614
Class 12: 40 instances sele

Metrics on Filipino Validation Set: {'accuracy': 0.3434991974317817, 'f1': 0.3749950998116085, 'rmse': np.float64(4.2438228170752765)}


Metrics on English validation Set: {'accuracy': 0.6894, 'f1': 0.6888677507371139, 'rmse': np.float64(3.3746999866654814)}

SELF-LEARNING ITERATION 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 22400/22400 [00:01<00:00, 14517.65 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.46,2.495588,0.312801,0.348815,4.40273
2,0.895,2.65561,0.333467,0.364908,4.288479
3,0.6717,2.767834,0.339286,0.371824,4.27496


Metrics on Filipino Validation Set: {'accuracy': 0.3392857142857143, 'f1': 0.3718241072539561, 'rmse': np.float64(4.274959754065107)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689702525400022, 'rmse': np.float64(3.3487012407797745)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4587


## **Self-learning with Adversarial Training**

### mBERT

In [60]:
# NOTE(review): the metrics recorded below are identical to the non-adversarial mBERT
# run above, which suggests `adv_training` was still False when this cell executed —
# confirm, then re-run with the flag enabled before reporting adversarial results.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 25605.97 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21881.98 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 15276.87 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 26140.42 examples/s]



SELF-LEARNING ITERATION 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 21915.58 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 791), Confidence range: 0.9181–0.9498
Class 1: 40 instances selected (out of 1616), Confidence range: 0.8163–0.8618
Class 2: 40 instances selected (out of 371), Confidence range: 0.9357–0.9631
Class 3: 40 instances selected (out of 79), Confidence range: 0.5519–0.9036
Class 4: 40 instances selected (out of 143), Confidence range: 0.7109–0.9436
Class 5: 40 instances selected (out of 769), Confidence range: 0.9213–0.9352
Class 6: 40 instances selected (out of 2557), Confidence range: 0.9771–0.9798
Class 7: 40 instances selected (out of 584), Confidence range: 0.9017–0.9551
Class 8: 40 instances selected (out of 2159), Confidence range: 0.9700–0.9721
Class 9: 40 instances selected (out of 3066), Confidence range: 0.8990–0.9256
Class 10: 40 instances selected (out of 374), Confidence range: 0.6817–0.8714
Class 11: 40 instances selected (out of 223), Confidence range: 0.7922–0.9358
Class 12: 40 instances selecte

Metrics on Filipino Validation Set: {'accuracy': 0.4410112359550562, 'f1': 0.4587407732423319, 'rmse': np.float64(4.232365927978654)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689478321905268, 'rmse': np.float64(3.421841609426129)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4587

SELF-LEARNING ITERATION 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:00<00:00, 20970.75 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5452,2.062803,0.368579,0.390707,4.268924
2,0.9664,2.140092,0.393459,0.424088,4.195895
3,0.7347,2.221549,0.388644,0.419401,4.130083


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1052), Confidence range: 0.9364–0.9512
Class 1: 40 instances selected (out of 1414), Confidence range: 0.8041–0.8723
Class 2: 40 instances selected (out of 294), Confidence range: 0.9271–0.9550
Class 3: 40 instances selected (out of 234), Confidence range: 0.6925–0.9077
Class 4: 40 instances selected (out of 628), Confidence range: 0.8531–0.9388
Class 5: 40 instances selected (out of 1218), Confidence range: 0.8867–0.9269
Class 6: 40 instances selected (out of 2029), Confidence range: 0.9681–0.9728
Class 7: 40 instances selected (out of 647), Confidence range: 0.9143–0.9457
Class 8: 40 instances selected (out of 2067), Confidence range: 0.9657–0.9686
Class 9: 40 instances selected (out of 2677), Confidence range: 0.8742–0.9075
Class 10: 40 instances selected (out of 1396), Confidence range: 0.8662–0.9031
Class 11: 40 instances selected (out of 333), Confidence range: 0.8563–0.9255
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.39345906902086675, 'f1': 0.42408771241429094, 'rmse': np.float64(4.1958953419629355)}


Metrics on English validation Set: {'accuracy': 0.6866, 'f1': 0.6855292050832662, 'rmse': np.float64(3.4478108996869303)}

SELF-LEARNING ITERATION 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:00<00:00, 21631.78 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5347,2.241271,0.342295,0.372288,4.415358
2,0.9338,2.334151,0.35634,0.386868,4.197736
3,0.7,2.359192,0.364767,0.398668,4.170065


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1050), Confidence range: 0.9643–0.9707
Class 1: 40 instances selected (out of 1338), Confidence range: 0.8855–0.9123
Class 2: 40 instances selected (out of 332), Confidence range: 0.9551–0.9701
Class 3: 40 instances selected (out of 515), Confidence range: 0.8719–0.9351
Class 4: 40 instances selected (out of 699), Confidence range: 0.9181–0.9591
Class 5: 40 instances selected (out of 1066), Confidence range: 0.9307–0.9498
Class 6: 40 instances selected (out of 1850), Confidence range: 0.9726–0.9760
Class 7: 40 instances selected (out of 972), Confidence range: 0.9559–0.9670
Class 8: 40 instances selected (out of 2014), Confidence range: 0.9726–0.9740
Class 9: 40 instances selected (out of 2614), Confidence range: 0.9247–0.9442
Class 10: 40 instances selected (out of 1664), Confidence range: 0.9339–0.9440
Class 11: 40 instances selected (out of 762), Confidence range: 0.9507–0.9652
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.3647672552166934, 'f1': 0.3986676154162394, 'rmse': np.float64(4.170065494441229)}


Metrics on English validation Set: {'accuracy': 0.696, 'f1': 0.694937290834445, 'rmse': np.float64(3.3450859480736814)}

SELF-LEARNING ITERATION 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21800/21800 [00:00<00:00, 21903.75 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.4979,2.441643,0.309591,0.333653,4.294744
2,0.9204,2.474061,0.338684,0.369457,4.231347
3,0.6931,2.581924,0.343499,0.374995,4.243823


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 884), Confidence range: 0.9518–0.9600
Class 1: 40 instances selected (out of 1463), Confidence range: 0.9094–0.9270
Class 2: 40 instances selected (out of 363), Confidence range: 0.9573–0.9704
Class 3: 40 instances selected (out of 920), Confidence range: 0.9297–0.9547
Class 4: 40 instances selected (out of 898), Confidence range: 0.9309–0.9644
Class 5: 40 instances selected (out of 869), Confidence range: 0.9312–0.9504
Class 6: 40 instances selected (out of 1572), Confidence range: 0.9709–0.9743
Class 7: 40 instances selected (out of 1028), Confidence range: 0.9594–0.9675
Class 8: 40 instances selected (out of 1824), Confidence range: 0.9739–0.9758
Class 9: 40 instances selected (out of 2500), Confidence range: 0.9455–0.9595
Class 10: 40 instances selected (out of 1773), Confidence range: 0.9510–0.9580
Class 11: 40 instances selected (out of 831), Confidence range: 0.9503–0.9614
Class 12: 40 instances sele

Metrics on Filipino Validation Set: {'accuracy': 0.3434991974317817, 'f1': 0.3749950998116085, 'rmse': np.float64(4.2438228170752765)}


Metrics on English validation Set: {'accuracy': 0.6894, 'f1': 0.6888677507371139, 'rmse': np.float64(3.3746999866654814)}

SELF-LEARNING ITERATION 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 22400/22400 [00:01<00:00, 21820.56 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.46,2.495588,0.312801,0.348815,4.40273
2,0.895,2.65561,0.333467,0.364908,4.288479
3,0.6717,2.767834,0.339286,0.371824,4.27496


Metrics on Filipino Validation Set: {'accuracy': 0.3392857142857143, 'f1': 0.3718241072539561, 'rmse': np.float64(4.274959754065107)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689702525400022, 'rmse': np.float64(3.3487012407797745)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4587


### XLM-RoBERTa

In [80]:
# Launch the self-learning run for the XLM-RoBERTa section.
# NOTE(review): the recorded output below reports BertForSequenceClassification
# initialised from bert-base-multilingual-cased, and its metrics are identical to
# the mBERT run above. Confirm the model checkpoint used by `self_training_loop`
# was actually switched to XLM-RoBERTa before this call.
self_training_loop()

Map: 100%|██████████| 20000/20000 [00:00<00:00, 25115.03 examples/s]
Map: 100%|██████████| 19998/19998 [00:00<00:00, 21892.52 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 14862.91 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 25717.06 examples/s]



SELF-LEARNING ITERATION 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20000/20000 [00:00<00:00, 21648.30 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 791), Confidence range: 0.9181–0.9498
Class 1: 40 instances selected (out of 1616), Confidence range: 0.8163–0.8618
Class 2: 40 instances selected (out of 371), Confidence range: 0.9357–0.9631
Class 3: 40 instances selected (out of 79), Confidence range: 0.5519–0.9036
Class 4: 40 instances selected (out of 143), Confidence range: 0.7109–0.9436
Class 5: 40 instances selected (out of 769), Confidence range: 0.9213–0.9352
Class 6: 40 instances selected (out of 2557), Confidence range: 0.9771–0.9798
Class 7: 40 instances selected (out of 584), Confidence range: 0.9017–0.9551
Class 8: 40 instances selected (out of 2159), Confidence range: 0.9700–0.9721
Class 9: 40 instances selected (out of 3066), Confidence range: 0.8990–0.9256
Class 10: 40 instances selected (out of 374), Confidence range: 0.6817–0.8714
Class 11: 40 instances selected (out of 223), Confidence range: 0.7922–0.9358
Class 12: 40 instances selecte

Metrics on Filipino Validation Set: {'accuracy': 0.4410112359550562, 'f1': 0.4587407732423319, 'rmse': np.float64(4.232365927978654)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689478321905268, 'rmse': np.float64(3.421841609426129)}
New best model found on iteration 1, epoch 3.0 with F1 = 0.4587

SELF-LEARNING ITERATION 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20600/20600 [00:01<00:00, 13482.20 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5452,2.062803,0.368579,0.390707,4.268924
2,0.9664,2.140092,0.393459,0.424088,4.195895
3,0.7347,2.221549,0.388644,0.419401,4.130083


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1052), Confidence range: 0.9364–0.9512
Class 1: 40 instances selected (out of 1414), Confidence range: 0.8041–0.8723
Class 2: 40 instances selected (out of 294), Confidence range: 0.9271–0.9550
Class 3: 40 instances selected (out of 234), Confidence range: 0.6925–0.9077
Class 4: 40 instances selected (out of 628), Confidence range: 0.8531–0.9388
Class 5: 40 instances selected (out of 1218), Confidence range: 0.8867–0.9269
Class 6: 40 instances selected (out of 2029), Confidence range: 0.9681–0.9728
Class 7: 40 instances selected (out of 647), Confidence range: 0.9143–0.9457
Class 8: 40 instances selected (out of 2067), Confidence range: 0.9657–0.9686
Class 9: 40 instances selected (out of 2677), Confidence range: 0.8742–0.9075
Class 10: 40 instances selected (out of 1396), Confidence range: 0.8662–0.9031
Class 11: 40 instances selected (out of 333), Confidence range: 0.8563–0.9255
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.39345906902086675, 'f1': 0.42408771241429094, 'rmse': np.float64(4.1958953419629355)}


Metrics on English validation Set: {'accuracy': 0.6866, 'f1': 0.6855292050832662, 'rmse': np.float64(3.4478108996869303)}

SELF-LEARNING ITERATION 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21200/21200 [00:01<00:00, 14476.92 examples/s]
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.5347,2.241271,0.342295,0.372288,4.415358
2,0.9338,2.334151,0.35634,0.386868,4.197736
3,0.7,2.359192,0.364767,0.398668,4.170065


Pseudo-labeled instance count per class:
Class 0: 40 instances selected (out of 1050), Confidence range: 0.9643–0.9707
Class 1: 40 instances selected (out of 1338), Confidence range: 0.8855–0.9123
Class 2: 40 instances selected (out of 332), Confidence range: 0.9551–0.9701
Class 3: 40 instances selected (out of 515), Confidence range: 0.8719–0.9351
Class 4: 40 instances selected (out of 699), Confidence range: 0.9181–0.9591
Class 5: 40 instances selected (out of 1066), Confidence range: 0.9307–0.9498
Class 6: 40 instances selected (out of 1850), Confidence range: 0.9726–0.9760
Class 7: 40 instances selected (out of 972), Confidence range: 0.9559–0.9670
Class 8: 40 instances selected (out of 2014), Confidence range: 0.9726–0.9740
Class 9: 40 instances selected (out of 2614), Confidence range: 0.9247–0.9442
Class 10: 40 instances selected (out of 1664), Confidence range: 0.9339–0.9440
Class 11: 40 instances selected (out of 762), Confidence range: 0.9507–0.9652
Class 12: 40 instances sel

Metrics on Filipino Validation Set: {'accuracy': 0.3647672552166934, 'f1': 0.3986676154162394, 'rmse': np.float64(4.170065494441229)}


Metrics on English validation Set: {'accuracy': 0.696, 'f1': 0.694937290834445, 'rmse': np.float64(3.3450859480736814)}

SELF-LEARNING ITERATION 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 21800/21800 [00:01<00:00, 14913.65 examples/s]
  super().__init__(*args, **kwargs)


## **Investigating the Results**

In [None]:
# Checkpoint directory to inspect.
# NOTE(review): an equivalent load cell appears again further down under a second
# "Investigating the Results" heading; confirm which copy is meant to stay.
model_path = './results/sl_adv/xlm-roberta-base/iter_1/checkpoint-1875'

# Load the tokenizer and the sequence-classification model from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [94]:
# Build the Filipino evaluation dataset from `fil_test_df` and apply `preprocess`
# in batches. `preprocess` is defined outside this cell; presumably it tokenizes
# with the global `tokenizer`, so run this after the tokenizer is loaded (TODO confirm).
# NOTE(review): the output recorded under this cell looks like a training log rather
# than the result of this mapping; the cell may need to be re-run.
fil_val_dataset = Dataset.from_pandas(fil_test_df).map(preprocess, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 22400/22400 [00:01<00:00, 14137.90 examples/s]
  super().__init__(*args, **kwargs)


In [None]:
# Re-invokes the self-learning loop.
# NOTE(review): this cell sits inside the results-investigation section and the
# output recorded beneath it reads like the tail of an earlier run's final
# iteration. It looks like a leftover fragment; confirm whether it should be kept.
self_training_loop()

Metrics on Filipino Validation Set: {'accuracy': 0.3392857142857143, 'f1': 0.3718241072539561, 'rmse': np.float64(4.274959754065107)}


Metrics on English validation Set: {'accuracy': 0.6906, 'f1': 0.689702525400022, 'rmse': np.float64(3.3487012407797745)}

Best overall model was from iteration 1, epoch 3.0 with F1 = 0.4587


## **Investigating the Results**

In [None]:
# Checkpoint directory to inspect.
# NOTE(review): presumably the best model reported by the run above
# (iteration 1, epoch 3.0); confirm that checkpoint-1875 is that epoch.
# NOTE(review): the directory is named xlm-roberta-base, but the training log recorded
# under the XLM-RoBERTa heading loads bert-base-multilingual-cased; verify which
# architecture this checkpoint actually contains.
model_path = "./results/sl_adv/xlm-roberta-base/iter_1/checkpoint-1875"  # or your specific path

# Load the tokenizer and the sequence-classification model from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Build the Filipino evaluation dataset from `fil_test_df` and apply `preprocess`
# in batches. `preprocess` is defined outside this cell; presumably it tokenizes
# with the global `tokenizer`, so run this after the checkpoint's tokenizer has
# been loaded (TODO confirm).
fil_val_dataset = Dataset.from_pandas(fil_test_df).map(preprocess, batched=True)

In [None]:
# Wrap the loaded model in a bare Trainer purely to run batched inference.
# `processing_class` is the replacement for the deprecated `tokenizer` keyword of
# Trainer.__init__, so this cell keeps working once the old keyword is removed.
trainer = Trainer(model=model, processing_class=tokenizer)

# Predict on the Filipino evaluation set and keep the raw logits and gold labels
# as notebook-level variables for the analysis cells that follow.
predictions = trainer.predict(fil_val_dataset)
logits = predictions.predictions
true_labels = predictions.label_ids

In [None]:
# Predicted class for each example = index of its largest logit.
predicted_labels = logits.argmax(axis=1)

# Per-class precision / recall / F1 against the gold labels, to 4 decimal places.
report = classification_report(
    true_labels,
    predicted_labels,
    digits=4,
)
print(report)

In [None]:
# Convert logits to class probabilities; the row-wise maximum is used as the
# confidence of the predicted label.
probs = F.softmax(torch.tensor(logits), dim=-1)
confidence_scores = probs.max(dim=1).values.numpy()

# Per-example results table: the evaluation rows plus the predicted label, whether
# it matches the gold label, and the confidence, with the raw frame-code and
# tokenizer columns removed.
eval_df = (
    fil_val_dataset.to_pandas()
    .assign(
        predicted_label=predicted_labels,
        match=lambda frame: frame['label'] == frame['predicted_label'],
        confidence=confidence_scores,
    )
    .drop(columns=['code_frames', 'input_ids', 'attention_mask'])
)

eval_df

In [None]:
from pathlib import Path

# Persist the per-example evaluation table. pandas does not create missing
# directories, so make sure the target folder exists before writing.
results_path = Path('datasets/results/xlm-r_adv1_results.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
eval_df.to_csv(results_path)
