In [None]:
!pip install -q transformers datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Every column of the training frame except the identifier and the raw text is a label.
non_label_columns = ('id', 'text')
labels = [column for column in train_df.keys() if column not in non_label_columns]
# Lookups in both directions between a label's position and its name.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
labels

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [None]:
# compute weights from train class balance
y_train = train_df[labels].values
# Fraction of all positive annotations that falls on each label ...
label_share = y_train.sum(axis=0) / y_train.sum()
# ... inverted relative to the most frequent label: that label gets weight 1,
# less frequent labels get proportionally larger weights.
weights = label_share.max() / label_share

In [None]:
# Rescale the per-label weights so that they add up to 1.
# NOTE(review): these values are later handed to WeightedTrainer as `class_weights`
# (used as BCEWithLogitsLoss `pos_weight`); after rescaling every entry is below 1.
# Confirm that down-weighting positives is intended.
weight_total = sum(weights)
weights = [weight / weight_total for weight in weights]
weights

array([ 8.        ,  9.88235294, 14.29787234,  1.        ,  3.16981132,
        2.97345133])

In [None]:
from transformers import AutoTokenizer
import numpy as np
# Hub id of the pretrained checkpoint; the tokenizer below and the models created later load from it.
model_checkpoint = "google-bert/bert-base-multilingual-uncased"

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_data(examples):
  """Tokenize a batch of examples and attach its label rows.

  `examples` is indexed by column name: "text" holds the batch of texts and
  every name listed in the global `labels` holds that label's values for the
  batch. Returns the tokenizer output (padded/truncated to 128 tokens) with an
  extra "labels" entry: one list of floats per text, ordered like `labels`.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

  # Keep only the label columns; any other column of the batch is ignored here.
  label_columns = {name: examples[name] for name in examples.keys() if name in labels}

  # One row per text, one column per label (float zeros until filled in below).
  label_matrix = np.zeros((len(texts), len(labels)))
  for column, name in enumerate(labels):
    label_matrix[:, column] = label_columns[name]

  encoded["labels"] = label_matrix.tolist()
  return encoded

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import datasets
# Wrap the pandas frame as a Hugging Face dataset and hold out 30% of it for evaluation.
# A fixed seed keeps the split, and therefore every metric reported below,
# reproducible across kernel restarts.
dataset = datasets.Dataset.from_pandas(train_df)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
        num_rows: 646
    })
    test: Dataset({
        features: ['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
        num_rows: 278
    })
})

In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/646 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

In [None]:
encoded_dataset.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

# Multi-label classification head with one output per entry of `labels`.
classifier_kwargs = dict(
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, **classifier_kwargs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size = 32
# Key (as produced by multi_label_metrics) of the metric that selects the best checkpoint.
metric_name = "f1_micro"
# Number of training epochs passed to TrainingArguments.
num_epochs=20

In [None]:
from transformers import TrainingArguments, Trainer

training_config = dict(
    # evaluate and checkpoint on the same (per-epoch) schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    # optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # keep the checkpoint that scored best on `metric_name`
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # no external experiment tracker
    report_to="none",
)
# The first positional argument is the output directory for checkpoints.
args = TrainingArguments(f"bert-base-finetuned-sem_eval-{lang}", **training_config)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import jaccard_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute multi-label evaluation metrics from raw logits.

    Args:
        predictions: logits of shape (batch_size, num_labels).
        labels: ground-truth 0/1 indicator matrix of the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1_micro', 'f1_macro', 'roc_auc', 'accuracy'
        and 'jaccard'.
    """
    # sigmoid maps every logit to a per-label probability
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    # threshold the probabilities into 0/1 predictions
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
    # ROC AUC measures ranking quality, so it is scored on the probabilities;
    # feeding it the thresholded predictions collapses the curve to a single point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    jaccard = jaccard_score(y_true, y_pred, average='samples', zero_division=0)

    return {'f1_micro': f1_micro_average,
            'f1_macro': f1_macro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy,
            'jaccard': jaccard}

def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics and return its metric dict."""
    # When the predictions arrive as a tuple, only its first element is scored.
    if isinstance(p.predictions, tuple):
        logits = p.predictions[0]
    else:
        logits = p.predictions
    return multi_label_metrics(predictions=logits, labels=p.label_ids)


In [None]:
from typing import Optional
from torch import FloatTensor
from torch.nn import BCEWithLogitsLoss
import logging

class WeightedTrainer(Trainer):
    """Trainer that optimises a per-label weighted BCE-with-logits loss.

    Args:
        class_weights: optional sequence or tensor with one weight per label,
            forwarded as `pos_weight` to `BCEWithLogitsLoss`. When None the
            loss is unweighted.
        *args, **kwargs: passed to `Trainer` unchanged.
    """

    def __init__(self, *args, class_weights: Optional[FloatTensor] = None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # float32 on the training device (GPU *or* CPU); as_tensor also accepts
            # an already-built tensor, a list or a numpy array.
            class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.args.device)
            logging.info("Using multi-label classification with class weights %s", class_weights)
        self.loss_fct = BCEWithLogitsLoss(pos_weight=class_weights)

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Compute the weighted multi-label loss for one batch.

        Args:
            model: the model being trained (possibly wrapped, e.g. DataParallel).
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            num_items_in_batch: accepted for compatibility with the Trainer API;
                not used by this loss.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # A wrapped model exposes the real one (and its num_labels) as `.module`.
        num_labels = getattr(model, "module", model).num_labels
        logits = outputs.logits.view(-1, num_labels)
        targets = labels.view(-1, num_labels).to(logits.dtype)
        loss = self.loss_fct(logits, targets)

        return (loss, outputs) if return_outputs else loss



In [None]:
# Baseline run: the stock Trainer (not WeightedTrainer) on the model instantiated above.
trainer = Trainer(
  model=model,
  args=args,
  compute_metrics=compute_metrics,
  processing_class=tokenizer,
  train_dataset=encoded_dataset["train"],
  eval_dataset=encoded_dataset["test"],
)
trainer.train()


In [None]:
trainer.evaluate()

In [None]:
! pip install optuna
! pip install ray[tune]

In [None]:
from transformers import AutoModelForSequenceClassification

def model_init():
    """Build and return a new multi-label classifier from `model_checkpoint`.

    The head has one output per entry of the global `labels`, with the
    `id2label` / `label2id` mappings stored in the model config.
    """
    head_config = {
        "problem_type": "multi_label_classification",
        "num_labels": len(labels),
        "id2label": id2label,
        "label2id": label2id,
    }
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, **head_config)

In [None]:
# Trainer with the class-weighted loss. It receives the `model_init` factory rather than
# a model instance (presumably so each hyperparameter-search trial starts from a fresh model).
trainer = WeightedTrainer(
    class_weights=weights,
    model_init=model_init,
    args=args,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Search on the same metric that selects the best checkpoint (`metric_name`).
# Without an explicit objective the search maximises the *sum* of all reported
# eval metrics, which mixes f1, ROC AUC, accuracy and Jaccard into one number.
best_run = trainer.hyperparameter_search(
    n_trials=10,
    direction="maximize",
    compute_objective=lambda metrics: metrics[f"eval_{metric_name}"],
)

In [None]:
best_run

BestRun(run_id='3', objective=2.636894837446536, hyperparameters={'learning_rate': 1.1710707232271146e-05, 'num_train_epochs': 4, 'seed': 35, 'per_device_train_batch_size': 32}, run_summary=None)

In [None]:
!ls

bert-base-finetuned-sem_eval-sun  drive  sample_data


In [None]:
# clean up drive
!rm -r bert-base-finetuned-sem_eval-sun/run*/

In [None]:
# Copy the winning hyperparameters onto the trainer's arguments, then run the final training.
for name, value in best_run.hyperparameters.items():
    setattr(trainer.args, name, value)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Roc Auc,Accuracy,Jaccard
1,No log,1.020051,0.596386,0.278436,0.74733,0.172662,0.479436
2,No log,1.00262,0.604782,0.172973,0.728014,0.460432,0.598321
3,No log,0.996255,0.618234,0.184055,0.734825,0.471223,0.611811
4,No log,0.993473,0.62166,0.198083,0.737716,0.467626,0.611811


TrainOutput(global_step=84, training_loss=0.9990363348098028, metrics={'train_runtime': 133.9415, 'train_samples_per_second': 19.292, 'train_steps_per_second': 0.627, 'total_flos': 169975846121472.0, 'train_loss': 0.9990363348098028, 'epoch': 4.0})

In [None]:
trainer.evaluate()

{'eval_loss': 1.0200505256652832,
 'eval_f1_micro': 0.5963855421686747,
 'eval_f1_macro': 0.278435733194279,
 'eval_roc_auc': 0.7473302570863547,
 'eval_accuracy': 0.17266187050359713,
 'eval_jaccard': 0.47943645083932845,
 'eval_runtime': 1.7823,
 'eval_samples_per_second': 155.98,
 'eval_steps_per_second': 5.05,
 'epoch': 4.0}

In [None]:
from pathlib import Path
# save model
# Make sure the per-language folder on Drive exists, then write the trainer's model below it.
language_dir = Path(f"/content/drive/MyDrive/data/semeval2025-11/{lang}/")
language_dir.mkdir(parents=True, exist_ok=True)
saved_model_dir = f"/content/drive/MyDrive/data/semeval2025-11/{lang}/bert-base-custom-loss/"
trainer.save_model(saved_model_dir)

In [None]:
dev_df = pd.read_csv(f"/content/drive/MyDrive/data/semeval2025-11/dataset/track_a/dev/{lang}.csv")

import numpy as np
def predict_labels(text):
  """Predict the labels of a single text with the trainer's fine-tuned model.

  Args:
    text: one input string.

  Returns:
    (predicted_labels, predictions): the names of the predicted labels and
    the full 0/1 indicator list ordered like `id2label`. At least one label
    is always predicted: if no probability reaches 0.5, the most probable
    label is used.
  """
  # Use the model owned by `trainer` (the one that was tuned, trained and saved),
  # not the separately created global `model`.
  classifier = trainer.model
  classifier.eval()  # inference mode: no dropout

  # truncation keeps over-long texts within the model's maximum sequence length
  encoding = tokenizer(text, return_tensors="pt", truncation=True)
  encoding = {k: v.to(classifier.device) for k, v in encoding.items()}

  with torch.no_grad():  # no gradients needed for prediction
    logits = classifier(**encoding).logits

  # apply sigmoid + threshold (drop only the batch dimension)
  probs = torch.sigmoid(logits.squeeze(0)).cpu().numpy()
  predictions = (probs >= 0.5).astype(int)
  # if no label reaches 0.5, take the most probable one as the assigned label
  if not predictions.any():
    predictions[probs.argmax()] = 1
  predictions = predictions.tolist()
  # turn predicted id's into actual label names
  predicted_labels = [id2label[idx] for idx, flag in enumerate(predictions) if flag == 1]
  return predicted_labels, predictions

# Predict every dev text and collect one record per text: label -> 0/1, plus the text's id.
# The loop variables are named so that the builtin `id` is not rebound for the rest of
# the kernel session.
results = []
texts = dev_df['text'].tolist()
ids = dev_df['id'].tolist()
for sample_text, sample_id in zip(texts, ids):
  _, predictions = predict_labels(sample_text)
  result = dict(zip(labels, predictions))
  result['id'] = sample_id
  results.append(result)

In [None]:
# Columns (and their order) written to the prediction file.
# NOTE(review): 'disgust' is one of `labels` but is not selected here; confirm that the
# target format really omits it.
submission_columns = ['id', 'anger', 'fear', 'joy', 'sadness', 'surprise']
result_df = pd.DataFrame(results)[submission_columns]

# Create the output folder on Drive if needed, then write the predictions without the index.
output_dir = Path("/content/drive/MyDrive/semeval2025-results/bert-base-custom-loss/track_a/")
output_dir.mkdir(parents=True, exist_ok=True)
result_df.to_csv(f'/content/drive/MyDrive/semeval2025-results/bert-base-custom-loss/track_a/pred_{lang}.csv',index=False)

In [None]:
result_df.head()

Unnamed: 0,id,anger,fear,joy,sadness,surprise
0,sun_dev_track_a_00001,0,0,0,1,1
1,sun_dev_track_a_00002,0,0,0,1,1
2,sun_dev_track_a_00003,0,0,0,1,1
3,sun_dev_track_a_00004,0,0,0,1,1
4,sun_dev_track_a_00005,0,0,0,1,1
