In [1]:
!pip -q install transformers accelerate datasets scikit-learn torch pandas numpy

In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Input CSV locations, relative to the notebook's working directory.
# English: training split and held-out test split.
en_train_file = 'datasets/pfc_train.csv'
en_test_file = 'datasets/pfc_test.csv'
# Filipino: test split used for cross-lingual evaluation.
fil_test_file = 'datasets/fil_test_1000.csv'

In [4]:
# === Load CSVs ===
# Small subsets keep a smoke-test run short. Set a size to None to load the full file.
train_sample_size = 80
test_sample_size = 20
sample_seed = 42


def load_split(csv_path, sample_size=None, seed=sample_seed):
    """Read a CSV and optionally draw a reproducible random subset of its rows.

    Args:
        csv_path: Path of the CSV file to read.
        sample_size: Number of rows to keep, or None to keep every row.
        seed: random_state passed to DataFrame.sample.

    Returns:
        The full DataFrame, or a random sample of at most `sample_size` rows.
    """
    frame = pd.read_csv(csv_path)
    if sample_size is None:
        return frame
    # Cap at the file length: DataFrame.sample raises when n exceeds the row count.
    return frame.sample(n=min(sample_size, len(frame)), random_state=seed)


en_train_df = load_split(en_train_file, train_sample_size)
fil_test_df = load_split(fil_test_file, test_sample_size)
en_test_df = load_split(en_test_file, test_sample_size)

In [5]:
# Preview the English training frame (the last expression of a cell is rendered as its output).
en_train_df

Unnamed: 0,text,code_frames,label
6779,DOH asks private sector not to procure bivalen...,2,1
9708,The Philippines will continue to engage with c...,14,13
7590,6 domestic flights canceled due to inclement w...,15,14
6449,The Department of Justice (DOJ) is planning to...,7,6
518,"MANILA, Philippines – Just like in previous ye...",6,5
...,...,...,...
5247,The Department of National Defense (DND) on Sa...,14,13
346,"MANILA, Philippines – President Ferdinand Marc...",13,12
6475,Manila archdiocese to establish 24/7 confessio...,15,14
3833,The Department of Budget and Management (DBM) ...,1,0


In [6]:
# Preview the Filipino test frame (the last expression of a cell is rendered as its output).
fil_test_df

Unnamed: 0,text,code-frame,label
521,NAREKOBER ng mga otoridad ang bangkay ng ginan...,"7. Law and Order, Crime and Justice",6.0
737,"Minadaling importasyon ng asukal, kinuwestiyon...",12. Public Opinion,11.0
740,Lalaking tinulungan pa rin asong nalaglag sa p...,15. Other,14.0
660,Mga dumalo sa UniTeam rally nawalan ng cellpho...,"7. Law and Order, Crime and Justice",6.0
411,Telcos wala nang dahilan ngayon para hindi map...,13. Political,12.0
678,"Sa hangaring maabot ang mas maraming Pilipino,...",11. Cultural Identity,10.0
626,"China Telecom, ikatlong telco?",2. Capacity and Resources,1.0
513,Mayroong 157 flights ang naka-schedule na uma...,15. Other,14.0
859,Isinailalim sa sa state of calamity ng Sanggun...,9. Health and Safety,8.0
136,Mga Fil-Am rumampa sa protesta vs hate crime,4. Fairness and Equality,3.0


In [7]:
# Preview the English test frame (the last expression of a cell is rendered as its output).
en_test_df

Unnamed: 0,text,code_frames,label
756,Lawmaker slams cops for inconsistent stories i...,7,6
642,The Philippines on Friday called on China anew...,14,13
2402,"MANILA, Philippines – Doctors “doctored” or fa...",7,6
1944,Justice Secretary Jesus Crispin Remulla on Tue...,7,6
252,MANILA – Senators on Tuesday President Ferdin...,5,4
353,Sandiganbayan affirms denial of ex-DBM exec's ...,7,6
1316,Driver of AUV in Salilig case found dead in Ta...,7,6
1642,MANILA – The country’s daily average of new co...,9,8
237,"MANILA, Philippines – The mothers of disappea...",7,6
1950,Senator Joseph Victor “JV” Ejercito on Tuesday...,6,5


In [8]:
# === CONFIGURATION ===

# Pretrained checkpoint to fine-tune.
# Supported choices: "bert-base-multilingual-cased" or "xlm-roberta-base".
model_name = "bert-base-multilingual-cased"

# Task / input shape
num_labels = 15    # size of the classification head
max_length = 256   # tokens per example after padding/truncation

# Optimisation
num_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01

# Per-device batch sizes
train_batch_size = 32
eval_batch_size = 32

In [9]:
# === Ensure labels are ints ===
# Apply the same cast to the "label" column of every split.
for split_df in (en_train_df, fil_test_df, en_test_df):
    split_df["label"] = split_df["label"].astype(int)

In [10]:
# === Convert to Hugging Face datasets ===
# One Dataset per DataFrame, in the order: English train, Filipino test, English test.
en_train_ds, val_fil_ds, val_en_ds = (
    Dataset.from_pandas(split_df)
    for split_df in (en_train_df, fil_test_df, en_test_df)
)

In [11]:
# === Tokenizer ===
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def preprocess(example):
    """Tokenize the "text" field, padding and truncating every sequence to `max_length`.

    Called by `Dataset.map(..., batched=True)`, so `example["text"]` is a batch of strings.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=max_length)
    return tokenizer(example["text"], **encode_options)

# Tokenize each split in turn; the mapped datasets replace the raw ones under the same names.
en_train_ds, val_fil_ds, val_en_ds = (
    split.map(preprocess, batched=True)
    for split in (en_train_ds, val_fil_ds, val_en_ds)
)

Map: 100%|██████████| 80/80 [00:00<00:00, 2821.76 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 2228.52 examples/s]
Map: 100%|██████████| 20/20 [00:00<?, ? examples/s]


In [13]:
# === Load model ===
# The classification head is absent from the checkpoint and is randomly initialised at load
# time, before the Trainer applies its own seed. Seed torch first so that repeated runs start
# from the same head weights.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# === Compute metrics using sklearn ===
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair as handed over by the Trainer.

    Returns:
        Dict with "accuracy", macro-averaged "f1", and "rmse" computed between
        the integer class ids of the labels and of the argmax predictions.
    """
    logits, labels = eval_pred
    y_true = np.array(labels)
    # Predicted class = index of the largest logit along the last axis.
    y_pred = torch.tensor(logits).argmax(dim=-1).numpy()

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {"accuracy": accuracy, "f1": macro_f1, "rmse": rmse}

In [23]:
# === Training arguments ===
# Checkpoints go to a per-model folder; any "/" in the model id is flattened to "_".
run_output_dir = f"results/std/{model_name.replace('/', '_')}"

training_args = TrainingArguments(
    output_dir=run_output_dir,
    # Optimisation
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Reload the checkpoint with the highest eval F1 when training ends
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    # Disable external experiment trackers
    report_to="none",
)

In [16]:
# === Trainer ===
# Trains on the English split and evaluates on the Filipino split after each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=en_train_ds,
    eval_dataset=val_fil_ds,  # Filipino validation set
    # `processing_class` replaces the deprecated `tokenizer=` keyword, which emits a
    # FutureWarning in current transformers releases.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [17]:
# Run fine-tuning with the model, data and arguments configured on `trainer`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Rmse
1,2.6386,2.717084,0.05,0.01,5.581219
2,2.5474,2.720411,0.05,0.009091,5.399074
3,2.5349,2.72643,0.05,0.009091,5.399074


TrainOutput(global_step=9, training_loss=2.573647075229221, metrics={'train_runtime': 354.8756, 'train_samples_per_second': 0.676, 'train_steps_per_second': 0.025, 'total_flos': 31577011937280.0, 'train_loss': 2.573647075229221, 'epoch': 3.0})

In [19]:
# === Final evaluation on English ===
final_en_metrics = trainer.evaluate(eval_dataset=val_en_ds)

# Print the four tracked metrics as a fixed-width table (names left-aligned to 9 columns).
divider = "------------------------------------------------"
print("Final Evaluation on English Validation Set")
print(divider)
for heading, metric_key in (
    ("Loss", "eval_loss"),
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("RMSE", "eval_rmse"),
):
    print(f"{heading:<9}: {final_en_metrics[metric_key]:.4f}")
print(divider)


Final Evaluation on English Validation Set
------------------------------------------------
Loss     : 2.6272
Accuracy : 0.1000
F1 Score : 0.0227
RMSE     : 3.5496
------------------------------------------------


In [18]:
# === Final evaluation on Filipino ===
final_fil_metrics = trainer.evaluate(eval_dataset=val_fil_ds)

# Print the four tracked metrics as a fixed-width table (names left-aligned to 9 columns).
divider = "------------------------------------------------"
print("Final Evaluation on Filipino Validation Set")
print(divider)
for heading, metric_key in (
    ("Loss", "eval_loss"),
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("RMSE", "eval_rmse"),
):
    print(f"{heading:<9}: {final_fil_metrics[metric_key]:.4f}")
print(divider)


Final Evaluation on Filipino Validation Set
------------------------------------------------
Loss     : 2.7171
Accuracy : 0.0500
F1 Score : 0.0100
RMSE     : 5.5812
------------------------------------------------


In [20]:
# === Predict on Filipino set using best model ===
fil_preds_output = trainer.predict(val_fil_ds)
# Raw per-class scores, one row per example.
logits = fil_preds_output.predictions
# Predicted class id = position of the largest score in each row.
fil_preds = torch.tensor(logits).argmax(dim=-1).numpy()

In [21]:
# === Attach predictions to original Filipino DataFrame ===
# Add the predicted class id and a per-row flag marking whether it equals the gold label.
fil_test_df = fil_test_df.assign(
    predicted_label=fil_preds,
    correct=lambda frame: frame["label"] == frame["predicted_label"],
)
fil_test_df.head()

Unnamed: 0,text,code-frame,label,predicted_label,correct
521,NAREKOBER ng mga otoridad ang bangkay ng ginan...,"7. Law and Order, Crime and Justice",6,5,False
737,"Minadaling importasyon ng asukal, kinuwestiyon...",12. Public Opinion,11,5,False
740,Lalaking tinulungan pa rin asong nalaglag sa p...,15. Other,14,5,False
660,Mga dumalo sa UniTeam rally nawalan ng cellpho...,"7. Law and Order, Crime and Justice",6,5,False
411,Telcos wala nang dahilan ngayon para hindi map...,13. Political,12,9,False


In [None]:
# === Save to CSV for inspection ===
# Write next to the input files: every CSV is read from "datasets/", and to_csv does not
# create missing directories, so the previous "dataset/" target would fail unless that
# folder happened to exist.
fil_test_df.to_csv("datasets/fil_predictions.csv", index=False)