In [2]:
import transformers
import torch
import datasets
import numpy as np
import onnxruntime

import segmentador

In [32]:
# Show every metric name reported by `datasets.list_metrics()` on one
# comma-separated line.
print(*datasets.list_metrics(), sep=", ")

accuracy, bertscore, bleu, bleurt, cer, chrf, code_eval, comet, competition_math, coval, cuad, f1, gleu, glue, google_bleu, indic_glue, matthews_correlation, mauve, meteor, pearsonr, precision, recall, rouge, sacrebleu, sari, seqeval, spearmanr, squad, squad_v2, super_glue, ter, wer, wiki_split, xnli


In [3]:
# Build the segmenter wrapper whose `.model` and `.tokenizer` attributes are
# handed to the collator and Trainer further down.
# NOTE(review): `local_files_only=True` presumably restricts loading to files
# already cached on disk (no download) — confirm against `segmentador`.
seg_model = segmentador.Segmenter(local_files_only=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [5]:
# Load the previously saved, already tokenized dataset from disk; its "train"
# and "eval" splits are passed to the Trainer below.
df_tokenized_split = datasets.load_from_disk("../data/df_tokenized_split")
# Bare last expression so the notebook displays the dataset summary.
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15993
    })
    eval: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [6]:
# seqeval metric handle used by `compute_metrics`; metric files are cached
# under ../cache/metrics.
metric_seqeval = datasets.load_metric("seqeval", cache_dir="../cache/metrics")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the seqeval metric.

    Parameters
    ----------
    eval_pred : tuple
        Pair ``(logits, labels)``, unpacked below.

    Returns
    -------
    The result of ``metric_seqeval.compute`` for the argmax predictions
    against ``labels``.
    """
    logits, labels = eval_pred
    # Argmax over the last axis turns per-class scores into one index per position.
    predictions = np.argmax(logits, axis=-1)
    # Fix: the metric object is bound to `metric_seqeval`; the previous
    # reference to an undefined `metric` raised NameError on first evaluation.
    # NOTE(review): seqeval is documented as taking lists of string tags;
    # confirm whether integer ids (and any padded label positions) must be
    # mapped/filtered before this call.
    return metric_seqeval.compute(predictions=predictions, references=labels)

# Shared interval for checkpointing, evaluation and logging: 10% of the number
# of training rows.
# NOTE(review): Trainer intervals are counted in optimizer steps, not rows.
# With a per-device batch of 8 and 4 accumulation steps, this interval may be
# longer than a full epoch — confirm this is the intended cadence.
save_steps = int(df_tokenized_split["train"].num_rows * 0.10)

training_args = transformers.TrainingArguments(
    output_dir="../segmenter_checkpoint",
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=5e-3,
    max_grad_norm=1.0,
    # --- batching / precision ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # --- evaluation, logging and checkpointing share one step interval ---
    evaluation_strategy="steps",
    eval_steps=save_steps,
    logging_strategy="steps",
    logging_steps=save_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=5,
    # --- best-model selection: lowest evaluation loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Batch collator for token classification, built on the segmenter's tokenizer;
# `pad_to_multiple_of=8` is forwarded to the collator.
data_collator = transformers.DataCollatorForTokenClassification(
    tokenizer=seg_model.tokenizer,
    pad_to_multiple_of=8,
)

# Wire the segmenter's model and tokenizer, the training configuration, the
# collator, both dataset splits and the metric callback into one Trainer.
trainer = transformers.Trainer(
    args=training_args,
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=df_tokenized_split["eval"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [8]:
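# Training is currently disabled.
# NOTE(review): with this line commented out, the save cell below persists
# whatever weights `seg_model` currently holds — confirm that is intended.
# trainer.train()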

In [9]:
# Target directory for the exported segmenter; the captured output below shows
# the config, model weights and tokenizer files written there.
pt_save_directory = "../pretrained_segmenter_model"
seg_model.save_pretrained(pt_save_directory)

Configuration saved in ../pretrained_segmenter_model/config.json
Model weights saved in ../pretrained_segmenter_model/pytorch_model.bin
tokenizer config file saved in ../pretrained_segmenter_model/tokenizer_config.json
Special tokens file saved in ../pretrained_segmenter_model/special_tokens_map.json
