In [167]:
!pip -q install transformers accelerate datasets scikit-learn torch pandas numpy

In [168]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer,
    TrainingArguments
)

In [169]:
# Input CSV locations, relative to the notebook's working directory.
# English files (mpfc_*): one for training, one for testing.
en_train_file = 'datasets/final/mpfc_train.csv'
en_test_file = 'datasets/final/mpfc_test.csv'
# Filipino file: used for evaluation only.
fil_test_file = 'datasets/final/fil_test.csv'

In [170]:
# === Load CSVs ===
# Read each file into its own DataFrame (order: English train, Filipino test, English test).
en_train_df, fil_test_df, en_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (en_train_file, fil_test_file, en_test_file)
)

In [171]:
# === Ensure labels are ints ===
# Cast the `label` column of every frame to int, in place.
for split_df in (en_train_df, fil_test_df, en_test_df):
    split_df['label'] = split_df['label'].astype(int)

In [172]:
# Display the English training frame (bare last expression -> rich notebook output).
en_train_df

Unnamed: 0,text,code_frames,label
0,Senator Sherwin Gatchalian filed a civil lawsu...,7,6
1,AVOID COLLATERAL DAMAGE FROM NRA'S CAMPAIGN,15,14
2,MANILA – Human immunodeficiency virus (HIV) in...,9,8
3,"MANILA, Philippines – President Ferdinand Marc...",2,1
4,Japanese Embassy in PH thanks DOJ over deporta...,14,13
...,...,...,...
19995,MANILA – President Ferdinand R. Marcos Jr. on ...,2,1
19996,"Do you have a question on the news - local, na...",12,11
19997,Davao Oriental 2nd district Rep. Cheeno Almari...,13,12
19998,The Philippine government is eyeing to deport ...,7,6


In [173]:
# Display the Filipino test frame (bare last expression -> rich notebook output).
fil_test_df

Unnamed: 0,text,code_frames,label
0,Isang umano’y tinaguriang ‘shabu queen’ at lid...,7,6
1,Anthrax infection kumalat sa Cagayan,9,8
2,TESDA: Mga tech-voc graduate swak sa trabaho,10,9
3,Nagkamit ng unang pwesto ang isang Filipina st...,15,14
4,NEDA inaprub tapyas taripa sa e-vehicle,6,5
...,...,...,...
4979,"Typhoon Betty, patuloy na humihina sa karagata...",9,8
4980,'Sarap maging tatay!' Post ng netizen tungkol ...,11,10
4981,"TESDA, maglulunsad ng training programs para s...",2,1
4982,"Anne Curtis, nagdiwang ng kaarawan sa ‘It’s Sh...",15,14


In [174]:
# Display the English test frame (bare last expression -> rich notebook output).
en_test_df

Unnamed: 0,text,code_frames,label
0,Physical distancing in classrooms may be eased...,10,9
1,Bishop took on sensitive social issues\r\n,3,2
2,MANILA – President Ferdinand R. Marcos Jr. has...,14,13
3,Florida voters strongly support an increase in...,12,11
4,The Supreme Court had approved new state death...,5,4
...,...,...,...
4995,"Telemachus 'Tel' Orfanos, 27, survived mass sh...",10,9
4996,"""Open Carry Picnic"" -- a mix of a typical outd...",12,11
4997,ASEAN first: Philippine presidents and their s...,13,12
4998,The Bureau of Immigration (BI) has stopped ano...,7,6


In [201]:
# === CONFIGURATION ===

# Pretrained checkpoint to fine-tune.
# Supported choices: 'bert-base-multilingual-cased' or 'xlm-roberta-base'
model_name = 'xlm-roberta-base'

# Task / tokenization
num_labels = 15      # number of classes passed to the classification model
max_length = 128     # token length every text is padded or truncated to

# Optimisation
num_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01

# Batch sizes (per device)
train_batch_size = 32
eval_batch_size = 32

In [200]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), then sets cuDNN's `deterministic` flag and clears its
    `benchmark` flag.

    Args:
        seed: Seed value handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)  # Or whatever seed you choose


In [202]:
# === Convert to Hugging Face datasets ===
# One `Dataset` per DataFrame, in the order: English train, Filipino test, English test.
en_train_ds, val_fil_ds, val_en_ds = map(
    Dataset.from_pandas,
    (en_train_df, fil_test_df, en_test_df),
)

In [203]:
# === Tokenizer ===
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [204]:
def preprocess(example):
    """Tokenize the `text` field, padding and truncating to `max_length` tokens.

    This is applied through `Dataset.map(..., batched=True)`, so despite its
    name `example` is expected to be a batch whose `text` entry holds several
    strings.

    Returns:
        The tokenizer's output for `example['text']`.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(example['text'], **encode_options)

# Tokenize all three datasets; `batched=True` hands `preprocess` a batch at a time.
en_train_ds, val_fil_ds, val_en_ds = (
    split.map(preprocess, batched=True)
    for split in (en_train_ds, val_fil_ds, val_en_ds)
)

Map: 100%|██████████| 20000/20000 [00:00<00:00, 30514.85 examples/s]
Map: 100%|██████████| 4984/4984 [00:00<00:00, 28429.42 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 30863.12 examples/s]


In [205]:
# === Load model ===
# Pretrained checkpoint wrapped for sequence classification with `num_labels` classes.
# The classifier weights are newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [206]:
# === Compute metrics using sklearn ===
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class of each row is
            the argmax of its logits over the last axis.

    Returns:
        Dict with keys `accuracy`, `f1` (weighted average) and `rmse`.
    """
    logits, labels = eval_pred
    predicted = np.argmax(np.asarray(logits), axis=-1)
    gold = np.array(labels)

    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average='weighted')
    # NOTE(review): RMSE treats class ids as numeric distances; confirm the
    # labels are ordinal, otherwise this value has no meaningful interpretation.
    rmse = np.sqrt(mean_squared_error(gold, predicted))
    return {'accuracy': accuracy, 'f1': weighted_f1, 'rmse': rmse}

In [207]:
# Output folder is results/std/<model name>; any '/' in the model name is
# replaced with '_' so the name maps to a single directory.
output_dir = f"results/std/{model_name.replace('/', '_')}"

In [209]:
# === Training arguments ===
training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation hyperparameters (values come from the configuration cell)
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    # Track the checkpoint with the highest eval F1 and reload it after training
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    save_total_limit=1,
    # No experiment-tracker reporting; fixed seed
    report_to='none',
    seed=42,
)

In [210]:
# === Trainer ===
# Trains on the English training set and evaluates on the Filipino set after
# every epoch.
# NOTE(review): `val_fil_ds` is built from the Filipino *test* file, and the
# training arguments reload the checkpoint with the best F1 on it
# (`load_best_model_at_end=True`, `metric_for_best_model='f1'`). The Filipino
# score reported afterwards is therefore measured on the same data used for
# checkpoint selection; consider selecting on a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=en_train_ds,
    eval_dataset=val_fil_ds,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the
    # FutureWarning shown below this cell); `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


## **Standard Fine-tuning**

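### mBERT

> **Note:** the outputs in this section appear to come from an earlier run with `model_name = 'bert-base-multilingual-cased'`. As the notebook is currently configured (`model_name = 'xlm-roberta-base'`), running all cells top to bottom trains XLM-RoBERTa here and then trains the same model again in the next section. To reproduce the mBERT numbers, change `model_name` and rerun the configuration, tokenizer, model and trainer cells first.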

In [185]:
# Run fine-tuning; metrics on the Trainer's `eval_dataset` are reported after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6231,1.836078,0.434791,0.44425,4.435375
2,1.0082,1.965006,0.421148,0.439205,4.353164
3,0.7599,1.953064,0.441011,0.458741,4.232366


TrainOutput(global_step=1875, training_loss=1.130421923828125, metrics={'train_runtime': 9589.9918, 'train_samples_per_second': 6.257, 'train_steps_per_second': 0.196, 'total_flos': 3947126492160000.0, 'train_loss': 1.130421923828125, 'epoch': 3.0})

In [188]:
# === Final evaluation on English ===
final_en_metrics = trainer.evaluate(eval_dataset=val_en_ds)

# Single print call; each argument becomes one line of the report.
print(
    'Final Evaluation on English Validation Set',
    f"Accuracy : {final_en_metrics['eval_accuracy']:.4f}",
    f"F1 Score : {final_en_metrics['eval_f1']:.4f}",
    f"RMSE     : {final_en_metrics['eval_rmse']:.4f}",
    sep='\n',
)



Final Evaluation on English Validation Set
Accuracy : 0.6906
F1 Score : 0.6895
RMSE     : 3.4218


In [189]:
# === Final evaluation on Filipino ===
final_fil_metrics = trainer.evaluate(eval_dataset=val_fil_ds)

# Single print call; each argument becomes one line of the report.
print(
    'Final Evaluation on Filipino Validation Set',
    f"Accuracy : {final_fil_metrics['eval_accuracy']:.4f}",
    f"F1 Score : {final_fil_metrics['eval_f1']:.4f}",
    f"RMSE     : {final_fil_metrics['eval_rmse']:.4f}",
    sep='\n',
)


Final Evaluation on Filipino Validation Set
Accuracy : 0.4410
F1 Score : 0.4587
RMSE     : 4.2324


### XLM-RoBERTa

In [211]:
# Run fine-tuning; metrics on the Trainer's `eval_dataset` are reported after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,1.6715,1.798217,0.442817,0.445454,4.006816
2,1.0373,1.813685,0.461878,0.471468,3.837228
3,0.8282,1.788131,0.472311,0.48153,3.831629


TrainOutput(global_step=1875, training_loss=1.1789679361979166, metrics={'train_runtime': 10380.328, 'train_samples_per_second': 5.78, 'train_steps_per_second': 0.181, 'total_flos': 3947126492160000.0, 'train_loss': 1.1789679361979166, 'epoch': 3.0})

In [212]:
# === Final evaluation on English ===
final_en_metrics = trainer.evaluate(eval_dataset=val_en_ds)

# Single print call; each argument becomes one line of the report.
print(
    'Final Evaluation on English Validation Set',
    f"Accuracy : {final_en_metrics['eval_accuracy']:.4f}",
    f"F1 Score : {final_en_metrics['eval_f1']:.4f}",
    f"RMSE     : {final_en_metrics['eval_rmse']:.4f}",
    sep='\n',
)

Final Evaluation on English Validation Set
Accuracy : 0.7040
F1 Score : 0.7018
RMSE     : 3.2117


In [213]:
# === Final evaluation on Filipino ===
final_fil_metrics = trainer.evaluate(eval_dataset=val_fil_ds)

# Single print call; each argument becomes one line of the report.
print(
    'Final Evaluation on Filipino Validation Set',
    f"Accuracy : {final_fil_metrics['eval_accuracy']:.4f}",
    f"F1 Score : {final_fil_metrics['eval_f1']:.4f}",
    f"RMSE     : {final_fil_metrics['eval_rmse']:.4f}",
    sep='\n',
)

Final Evaluation on Filipino Validation Set
Accuracy : 0.4723
F1 Score : 0.4815
RMSE     : 3.8316


## **Save Predictions**

In [None]:
# === Predict on Filipino set using best model ===
# The training arguments set `load_best_model_at_end=True`, so after training
# the trainer holds the best checkpoint rather than the last one.
fil_preds_output = trainer.predict(val_fil_ds)
logits = fil_preds_output.predictions
# Predicted class id per row = index of the largest logit.
fil_preds = np.argmax(np.asarray(logits), axis=-1)

In [None]:
# === Attach predictions to original Filipino DataFrame ===
fil_test_df['predicted_label'] = fil_preds
# Row-wise flag: True where the predicted label equals the gold label.
fil_test_df['correct'] = fil_test_df['label'].eq(fil_test_df['predicted_label'])
fil_test_df.head()

In [None]:
# === Save to CSV for inspection ===
# `DataFrame.to_csv` does not create missing directories, and nothing earlier in
# the notebook creates `dataset/` (the inputs live under `datasets/`), so make
# sure the target folder exists before writing.
predictions_path = Path('dataset/fil_predictions.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
fil_test_df.to_csv(predictions_path, index=False)