In [1]:
import os

import datasets
import numpy as np
import sklearn.metrics
import torch
import torch.nn
import transformers
# import onnxruntime

import segmentador


# --- Run switches -----------------------------------------------------------
# Training only happens when TRAIN_MODEL is True and DEBUG_RUN is False.
TRAIN_MODEL = False
# When True, the last cell runs prediction on the "test" split and saves the metrics.
PREDICT_TEST_SET_AT_END = True
# When True, prediction uses one shard (1/50) of the test split instead of all of it.
DEBUG_RUN = False


# --- Hardware / loading -----------------------------------------------------
USE_FP16 = False
# Fall back to CPU when CUDA is unavailable, mirroring the guard already
# applied to fp16 in the TrainingArguments cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# When True, the Hugging Face Hub candidate is removed from the model URIs.
LOCAL_FILES_ONLY = True
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
LOAD_PRETRAINED_WEIGHTS = True

# --- Training ---------------------------------------------------------------
NUM_TRAIN_EPOCHS = 4
GRAD_ACCUMULATION_STEPS = 16
TRAINED_MODEL_SAVE_PATH = "../pretrained_segmenter_model"
TRAINER_STATE_SAVE_PATH = "../saved_trainer_states"

In [2]:
# Candidate model sources, tried in insertion order.
# Maps tag -> (model URI, value passed as `init_from_pretrained_weights`).
MODEL_URIS = {
    "trained_model": (TRAINED_MODEL_SAVE_PATH, True),
    "pretrained_bertimbau_hugginface_hub": ("neuralmind/bert-base-portuguese-cased", False),
    "pretrained_bertimbau_local": ("../base_models/bert-base-portuguese-cased", False),
}


if LOCAL_FILES_ONLY:
    # The Hub candidate is dropped when only local files may be used.
    MODEL_URIS.pop("pretrained_bertimbau_hugginface_hub")


for uri_tag, (uri_model, init_from_pretrained) in MODEL_URIS.items():
    try:
        seg_model = segmentador.Segmenter(
            local_files_only=LOCAL_FILES_ONLY,
            device=DEVICE,
            uri_model=uri_model,
            init_from_pretrained_weights=init_from_pretrained,
            uri_tokenizer="../tokenizers",
        )

    except TypeError as e:
        # Report the failure and fall through to the next candidate.
        print(e)

    else:
        print(f"Loaded model from ({uri_tag}, {uri_model}).")
        break

else:
    # No candidate loaded. Fail here instead of hitting a NameError below or,
    # worse, silently reusing a stale `seg_model` left over in the kernel.
    raise RuntimeError(
        f"Could not load a segmenter model from any candidate: {list(MODEL_URIS)}"
    )


seg_model.model

Loaded model from (trained_model, ../pretrained_segmenter_model).


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(6000, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [3]:
# Display the configuration object of the loaded model.
seg_model.model.config

BertConfig {
  "_name_or_path": "../pretrained_segmenter_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",


In [4]:
# Load the dataset previously saved on disk at this path; the bare name on the
# last line displays its summary as the cell output.
df_tokenized_split = datasets.load_from_disk("../data/df_tokenized_split_0_120000")
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 114884
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14361
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14361
    })
})

In [5]:
def compute_metrics(eval_pred):
    """Compute token-level classification metrics from logits and labels.

    Parameters
    ----------
    eval_pred : pair ``(pred_logits, labels)``
        ``pred_logits`` holds per-token class scores with the class axis
        last; ``labels`` holds the integer class of every token, where
        ``-100`` marks positions that are excluded from scoring. After the
        arg-max over the class axis both arrays must have the same shape.

    Returns
    -------
    dict
        ``per_cls_precision_{i}`` and ``per_cls_recall_{i}`` for classes
        0..3, plus ``macro_precision``, ``macro_recall``, ``macro_f1``
        (harmonic mean of the two macro averages) and ``overall_accuracy``.
        All values are plain Python floats. When no position is scored,
        every metric is 0.0.
    """
    pred_logits, labels = eval_pred
    labels = np.asarray(labels)
    predictions = np.argmax(pred_logits, axis=-1)

    # One boolean mask replaces the nested per-token Python loops.
    scored = labels != -100
    true_labels = labels[scored]
    true_predictions = predictions[scored]
    num_scored = int(true_labels.size)

    if num_scored:
        conf_mat = sklearn.metrics.confusion_matrix(
            y_true=true_labels,
            y_pred=true_predictions,
            labels=(0, 1, 2, 3),
        )
    else:
        # Nothing to score: use an all-zero matrix so every metric is 0.0.
        conf_mat = np.zeros((4, 4), dtype=int)

    hits = conf_mat.diagonal()

    # Columns are predicted classes, rows are true classes; the epsilon keeps
    # classes that never occur from dividing by zero.
    per_cls_precision = hits / (1e-8 + conf_mat.sum(axis=0))
    per_cls_recall = hits / (1e-8 + conf_mat.sum(axis=1))

    macro_precision = float(np.mean(per_cls_precision))
    macro_recall = float(np.mean(per_cls_recall))
    macro_f1 = 2.0 * macro_precision * macro_recall / (1e-8 + macro_precision + macro_recall)

    # Guard the empty case, which previously raised ZeroDivisionError.
    overall_accuracy = float(np.sum(hits)) / num_scored if num_scored else 0.0

    res = {
        **{f"per_cls_precision_{cls_i}": float(score) for cls_i, score in enumerate(per_cls_precision)},
        **{f"per_cls_recall_{cls_i}": float(score) for cls_i, score in enumerate(per_cls_recall)},
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "overall_accuracy": overall_accuracy,
    }

    return res


# Shared interval (in steps) for checkpointing, evaluation and logging.
# Clamped to at least 1 so that a small training set cannot truncate it to 0.
# NOTE(review): the formula divides by the accumulation steps only, not by the
# per-device batch size (8) -- confirm this is the intended interval.
save_steps = max(1, int(df_tokenized_split["train"].num_rows / GRAD_ACCUMULATION_STEPS * 0.10))


training_args = transformers.TrainingArguments(
    output_dir="../segmenter_checkpoint",
    # fp16 is enabled only when requested and CUDA is actually available.
    fp16=USE_FP16 and torch.cuda.is_available(),
    # Best checkpoint = lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Save, evaluate and log on the same step schedule.
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=5e-4,
    max_grad_norm=1.0,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.0,
    report_to="all",
)

# Pads each batch; the padded length is a multiple of 8 only when fp16 is requested.
data_collator = transformers.DataCollatorForTokenClassification(
    seg_model.tokenizer,
    pad_to_multiple_of=8 if USE_FP16 else 1,
)

trainer = transformers.Trainer(
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=df_tokenized_split["eval"],
    compute_metrics=compute_metrics,
)

In [6]:
if TRAIN_MODEL and not DEBUG_RUN:
    # NOTE(review): resuming presumably requires TRAINER_STATE_SAVE_PATH to
    # hold a complete checkpoint (weights, optimizer state, trainer_state.json),
    # not only the state file written below -- confirm the intended workflow.
    trainer.train(resume_from_checkpoint=TRAINER_STATE_SAVE_PATH)

    trainer.save_model(TRAINED_MODEL_SAVE_PATH)

    # `Trainer.save_state()` takes no path argument, so passing one raised
    # TypeError after training had already finished. Write the state JSON to
    # the requested directory explicitly instead.
    os.makedirs(TRAINER_STATE_SAVE_PATH, exist_ok=True)
    trainer.state.save_to_json(os.path.join(TRAINER_STATE_SAVE_PATH, "trainer_state.json"))

In [7]:
if PREDICT_TEST_SET_AT_END:
    predict_split = "test"

    # A debug run scores only the first of 50 shards; otherwise the whole split is used.
    test_subset = (
        df_tokenized_split[predict_split].shard(num_shards=50, index=0)
        if DEBUG_RUN
        else df_tokenized_split[predict_split]
    )

    # Run prediction, show the resulting metrics, then save them via the trainer.
    y_test_preds = trainer.predict(test_subset)
    print(y_test_preds.metrics)
    trainer.save_metrics(split=predict_split, metrics=y_test_preds.metrics)

***** Running Prediction *****
  Num examples = 14361
  Batch size = 8


{'test_loss': 0.0039053994696587324, 'test_per_cls_precision_0': 0.9994245141056332, 'test_per_cls_precision_1': 0.9847286415399228, 'test_per_cls_precision_2': 0.9560322666421806, 'test_per_cls_precision_3': 0.8218331616847486, 'test_per_cls_recall_0': 0.999482910931559, 'test_per_cls_recall_1': 0.9873317939325593, 'test_per_cls_recall_2': 0.9505552617654234, 'test_per_cls_recall_3': 0.639679358714871, 'test_macro_precision': 0.9405046459931213, 'test_macro_recall': 0.8942623313361033, 'test_macro_f1': 0.9168007525386421, 'test_overall_accuracy': 0.9988911708635777, 'test_runtime': 1790.3335, 'test_samples_per_second': 8.021, 'test_steps_per_second': 1.003}
