In [None]:
import os

import transformers
import torch
import torch.nn
import datasets
import numpy as np
import sklearn.metrics
# import onnxruntime

import segmentador


# --- Run-mode switches -------------------------------------------------------
# Training only runs when TRAIN_MODEL is True *and* DEBUG_RUN is False.
TRAIN_MODEL = False
# When True, the last cell evaluates the model on the "test" split.
PREDICT_TEST_SET_AT_END = True
# When True, the eval/test splits are reduced to 1 shard out of 500 and training is skipped.
DEBUG_RUN = True
# Base directory that every relative resource path below is joined to.
RESOURCE_DIR = ".."

# --- Hardware / loading options ----------------------------------------------
# Mixed precision is additionally gated on CUDA availability where it is consumed.
USE_FP16 = False
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines
# without CUDA (the fp16 switch is guarded by the same availability check).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# When True, models are loaded from local files only and the hub URI is not tried.
LOCAL_FILES_ONLY = True
# NOTE(review): not referenced by the visible cells of this notebook - confirm whether it is still needed.
LOAD_PRETRAINED_WEIGHTS = True

# --- Training hyper-parameters -----------------------------------------------
NUM_TRAIN_EPOCHS = 4
# Forwarded to `segmentador.Segmenter(num_hidden_layers=...)`.
NUM_HIDDEN_LAYERS = 6
GRAD_ACCUMULATION_STEPS = 16

# --- Output locations --------------------------------------------------------
TRAINED_MODEL_SAVE_PATH = os.path.join(RESOURCE_DIR, "pretrained_segmenter_model")
TRAINER_STATE_SAVE_PATH = os.path.join(RESOURCE_DIR, "saved_trainer_states")

In [None]:
# Candidate model sources, tried in insertion order until one loads.
# Each value is `(uri_model, init_from_pretrained_weights)`, forwarded as-is to
# `segmentador.Segmenter`.
MODEL_URIS = {
    "trained_model": (TRAINED_MODEL_SAVE_PATH, True),
    "pretrained_bertimbau_hugginface_hub": ("neuralmind/bert-base-portuguese-cased", False),
    "pretrained_bertimbau_local": ("../base_models/bert-base-portuguese-cased", False),
}


if LOCAL_FILES_ONLY:
    # The hub entry is not a local path, so it is skipped in local-only mode.
    MODEL_URIS.pop("pretrained_bertimbau_hugginface_hub", None)


for uri_tag, (uri_model, init_from_pretrained) in MODEL_URIS.items():
    try:
        seg_model = segmentador.Segmenter(
            local_files_only=LOCAL_FILES_ONLY,
            device=DEVICE,
            uri_model=uri_model,
            init_from_pretrained_weights=init_from_pretrained,
            uri_tokenizer=os.path.join(RESOURCE_DIR, "tokenizers"),
            num_hidden_layers=NUM_HIDDEN_LAYERS,
        )

    except TypeError as e:
        # Report which candidate failed, then fall through to the next one.
        print(f"Could not load model from ({uri_tag}, {uri_model}): {e}")
        continue

    print(f"Loaded model from ({uri_tag}, {uri_model}).")
    break

else:
    # No candidate loaded. Fail here with a clear message instead of hitting a NameError
    # below (or, worse, silently reusing a stale `seg_model` left in the kernel).
    raise RuntimeError(
        f"Could not load a segmenter model from any of the candidates: {list(MODEL_URIS)}."
    )


seg_model.model

In [None]:
# Display the configuration object of the model loaded in the previous cell.
seg_model.model.config

In [None]:
# Read the pre-tokenized dataset from disk; the following cells index it by split name
# ("train", "eval" and "test").
tokenized_dataset_dir = os.path.join(RESOURCE_DIR, "data/df_tokenized_split_0_120000")
df_tokenized_split = datasets.load_from_disk(tokenized_dataset_dir)
df_tokenized_split

In [None]:
def compute_metrics(eval_pred):
    """Compute token-level classification metrics for the evaluation loop.

    Parameters
    ----------
    eval_pred : pair ``(pred_logits, labels)``
        ``pred_logits`` holds per-token class scores (the class axis is the last one);
        ``labels`` holds the integer class of each token, aligned with ``pred_logits``
        without its last axis. Positions labelled ``-100`` are excluded from every metric.

    Returns
    -------
    dict
        ``per_cls_precision_{i}`` and ``per_cls_recall_{i}`` for the classes 0..3,
        plus ``macro_precision``, ``macro_recall``, ``macro_f1`` (harmonic mean of the
        macro precision and macro recall) and ``overall_accuracy``. All values are
        plain Python floats.
    """
    pred_logits, labels = eval_pred
    predictions = np.argmax(pred_logits, axis=-1)
    labels = np.asarray(labels)

    # A boolean mask selects the scored tokens in one vectorized step (row-major order),
    # instead of looping over every token in Python.
    scored_mask = labels != -100
    true_predictions = predictions[scored_mask]
    true_labels = labels[scored_mask]

    conf_mat = sklearn.metrics.confusion_matrix(
        y_true=true_labels,
        y_pred=true_predictions,
        labels=(0, 1, 2, 3),
    )

    # Rows are true classes and columns are predicted classes; the small epsilon keeps
    # classes that never occur from dividing by zero (their score becomes 0).
    true_positives = conf_mat.diagonal()
    per_cls_precision = true_positives / (1e-8 + conf_mat.sum(axis=0))
    per_cls_recall = true_positives / (1e-8 + conf_mat.sum(axis=1))

    macro_precision = float(np.mean(per_cls_precision))
    macro_recall = float(np.mean(per_cls_recall))
    macro_f1 = 2.0 * macro_precision * macro_recall / (1e-8 + macro_precision + macro_recall)

    # Guard against a batch in which every position is ignored.
    num_scored = int(true_labels.size)
    overall_accuracy = float(np.sum(true_positives)) / num_scored if num_scored else 0.0

    res = {
        **{f"per_cls_precision_{cls_i}": float(score) for cls_i, score in enumerate(per_cls_precision)},
        **{f"per_cls_recall_{cls_i}": float(score) for cls_i, score in enumerate(per_cls_recall)},
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "overall_accuracy": overall_accuracy,
    }

    return res


# Checkpoints, evaluations and log entries all share one step interval derived from the
# training set size. `max(1, ...)` keeps the interval valid on very small training sets,
# where the integer truncation would otherwise yield 0.
# NOTE(review): the interval is scaled by GRAD_ACCUMULATION_STEPS only, not by the
# per-device batch size, so it is not 10% of the optimizer steps of one epoch - confirm
# that this is the intended frequency.
save_steps = max(1, int(df_tokenized_split["train"].num_rows / GRAD_ACCUMULATION_STEPS * 0.10))


training_args = transformers.TrainingArguments(
    output_dir=os.path.join(RESOURCE_DIR, "segmenter_checkpoint"),
    logging_dir=os.path.join(RESOURCE_DIR, "loggings"),
    fp16=USE_FP16 and torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=5e-4,
    max_grad_norm=1.0,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.0,
    report_to="all",
)

data_collator = transformers.DataCollatorForTokenClassification(
    seg_model.tokenizer,
    pad_to_multiple_of=8 if USE_FP16 else 1,
)

# Select the evaluation data into its own variable instead of overwriting
# `df_tokenized_split["eval"]`: re-running this cell then always shards the full split,
# rather than sharding the already-sharded result again.
eval_subset = df_tokenized_split["eval"]

if DEBUG_RUN:
    # Debug runs evaluate on 1 shard out of 500 only.
    eval_subset = eval_subset.shard(num_shards=500, index=0)

trainer = transformers.Trainer(
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

In [None]:
if TRAIN_MODEL and not DEBUG_RUN:
    # `Trainer.train` raises a ValueError when `resume_from_checkpoint` points to a path
    # without a valid checkpoint, so only resume when the directory exists and start
    # from scratch otherwise (e.g. on the very first run).
    # NOTE(review): the directory must hold a complete Trainer checkpoint to be resumable.
    if os.path.isdir(TRAINER_STATE_SAVE_PATH):
        resume_checkpoint = TRAINER_STATE_SAVE_PATH
    else:
        resume_checkpoint = None
        print(f"No checkpoint directory at '{TRAINER_STATE_SAVE_PATH}'; training from scratch.")

    train_results = trainer.train(resume_from_checkpoint=resume_checkpoint)

    train_metrics = train_results.metrics
    trainer.log_metrics(split="train", metrics=train_metrics)
    trainer.save_metrics(split="train", metrics=train_metrics)

    trainer.save_model(TRAINED_MODEL_SAVE_PATH)
    # `Trainer.save_state()` takes no path argument (passing one raises a TypeError after
    # training has finished); it writes `trainer_state.json` into `training_args.output_dir`.
    trainer.save_state()

In [None]:
if PREDICT_TEST_SET_AT_END:
    predict_split = "test"
    test_subset = df_tokenized_split[predict_split]

    if DEBUG_RUN:
        # Debug runs evaluate on 1 shard out of 500 only.
        test_subset = test_subset.shard(num_shards=500, index=0)

    # Prefix the metric names with the split name: `Trainer.evaluate` defaults to the
    # "eval" prefix, which would store test-set results under `eval_*` keys.
    test_metrics = trainer.evaluate(test_subset, metric_key_prefix=predict_split)

    trainer.log_metrics(split=predict_split, metrics=test_metrics)
    trainer.save_metrics(split=predict_split, metrics=test_metrics)