In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, EvalPrediction, AutoModelForSequenceClassification)

import focal_loss as fl

In [None]:
# Names used for the on-disk dataset folder and for the model output folder.
model_name = 'reviews-with-genre'
dataset_name = 'reviews-with-genre'

# Pretrained checkpoint to fine-tune. Alternatives that can be swapped in:
#   transformer_model = 'NbAiLab/nb-bert-base'
#   transformer_model = 'NbAiLab/nb-bert-large'
transformer_model = 'distilbert/distilbert-base-multilingual-cased'

In [None]:
# Load the saved dataset from disk, then keep only its 'train' split.
dataset = load_from_disk(f'../datasets/{dataset_name}')
dataset = dataset['train']

# Clear this dataset's cache files before re-processing; the call's return
# value is the cell output.
dataset.cleanup_cache_files()

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `transformer_model`.
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

In [None]:
# Collect every distinct label that appears in any row of the 'labels' column.
all_labels = set()
for row_labels in dataset['labels']:
    all_labels.update(row_labels)

# Assign ids in sorted order rather than set-iteration order: iterating a set
# of strings can yield a different order on every interpreter start (hash
# randomization), which would silently change the label <-> id mapping between
# kernel restarts and make runs/checkpoints incomparable.
# NOTE(review): assumes labels are mutually comparable (e.g. all strings).
id2label = dict(enumerate(sorted(all_labels)))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
def preprocess(examples):
    """Tokenize one row and attach its multi-hot label vector.

    Reads ``examples['text']`` and ``examples['labels']``. The returned
    tokenizer output gains a ``'labels'`` entry: a list of floats with one
    slot per entry of ``id2label``, set to 1.0 where that label is present
    in the row and 0.0 otherwise.

    The text is padded/truncated to exactly 128 tokens.
    """
    row_labels = examples['labels']

    # Start from all zeros and switch on the slots whose label occurs in the row.
    multi_hot = [0.0] * len(id2label)
    for slot, name in id2label.items():
        if name in row_labels:
            multi_hot[slot] = 1.0

    encoded = tokenizer(examples['text'], padding='max_length', max_length=128, truncation=True)
    encoded['labels'] = multi_hot
    return encoded

In [None]:
# Run `preprocess` over every row, dropping the dataset's original columns.
tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Sanity check: take the first row of the training split and list its fields.
example = tokenized_dataset['train'][0]
example.keys()

In [None]:
# Turn the token ids back into text to eyeball what the model will actually see
# (rows were padded/truncated to 128 tokens in `preprocess`).
tokenizer.decode(example['input_ids'])

In [None]:
# Multi-hot label vector produced by `preprocess`: one float per entry of `id2label`.
example['labels']

In [None]:
# Translate the multi-hot vector back into the names of the labels that are set.
[id2label[position] for position, flag in enumerate(example['labels']) if flag == 1.0]

In [None]:
# Pretrained encoder with a classification head that has one output per label,
# configured as a multi-label problem and carrying the label <-> id mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    transformer_model,
    problem_type='multi_label_classification',
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: one epoch, evaluating and checkpointing at the end of
# each epoch, with the batch size found automatically.
train_args = TrainingArguments(
    output_dir=f'../models/{model_name}',
    num_train_epochs=1,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    auto_find_batch_size=True,
)

In [None]:
# Load the F1 metric implementation from the `evaluate` library.
metric = evaluate.load('f1')

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute micro-, macro- and support-weighted F1 for multi-label output.

    ``p.predictions`` holds raw logits (or a tuple whose first item is the
    logits) and ``p.label_ids`` the multi-hot reference vectors; both are
    treated as (n_samples, n_labels) matrices. A label is predicted when its
    logit is > 0, i.e. when its sigmoid probability exceeds 0.5.

    The scores are averaged over *labels*. The matrices must not be flattened
    first: that collapses the task into one binary problem, where "micro" is
    plain accuracy and "macro"/"weighted" average over the classes {0, 1}, so
    the many true negatives dominate every score.

    Returns a dict with the keys 'f1 micro', 'f1 macro' and 'f1 weighted'.
    A label (or the whole set) with no positives in either predictions or
    references contributes an F1 of 0.0.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predictions = np.atleast_2d((np.asarray(logits) > 0).astype(int))
    references = np.atleast_2d(np.asarray(p.label_ids).astype(int))

    # Per-label confusion counts, summed over the sample axis.
    true_pos = np.sum((predictions == 1) & (references == 1), axis=0)
    false_pos = np.sum((predictions == 1) & (references == 0), axis=0)
    false_neg = np.sum((predictions == 0) & (references == 1), axis=0)

    def _f1(tp, fp, fn):
        # F1 = 2TP / (2TP + FP + FN); defined as 0 when the denominator is 0.
        tp = np.asarray(tp, dtype=float)
        denom = 2.0 * tp + fp + fn
        safe_denom = np.where(denom > 0, denom, 1.0)
        return np.where(denom > 0, 2.0 * tp / safe_denom, 0.0)

    per_label = _f1(true_pos, false_pos, false_neg)
    support = true_pos + false_neg  # number of positive references per label

    # Micro: pool the counts of all labels, then compute one F1.
    micro = float(_f1(true_pos.sum(), false_pos.sum(), false_neg.sum()))
    # Macro: unweighted mean of the per-label F1 scores.
    macro = float(per_label.mean()) if per_label.size else 0.0
    # Weighted: per-label F1 weighted by how often each label occurs.
    weighted = float(np.average(per_label, weights=support)) if support.sum() > 0 else 0.0

    result = {
        'f1 micro': micro,
        'f1 macro': macro,
        'f1 weighted': weighted
    }

    return result

In [None]:
# Assemble the trainer from the model, training arguments, the train/test
# splits, the tokenizer and the metric callback.
# NOTE(review): `FocalLossTrainer` comes from the local `focal_loss` module;
# presumably a Trainer variant that swaps in a focal loss — confirm there.
trainer = fl.FocalLossTrainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Start fine-tuning using the configuration passed to the trainer above.
trainer.train()