In [1]:
import transformers
import torch
import torch.nn
import datasets
import numpy as np
import onnxruntime

import segmentador

USE_FP16 = False
NUM_LAYERS = 2
DEVICE = "cuda"
LOCAL_FILES_ONLY = True
NUM_TRAIN_EPOCHS = 2
GRAD_ACCUMULATION_STEPS = 16
TRAINED_MODEL_SAVE_PATH = "../pretrained_segmenter_model"

In [None]:
print(", ".join(datasets.list_metrics()))

In [2]:
for uri in [TRAINED_MODEL_SAVE_PATH, "neuralmind/bert-base-portuguese-cased"]:
    try:
        seg_model = segmentador.Segmenter(
            local_files_only=LOCAL_FILES_ONLY,
            device=DEVICE,
            uri_model=uri,
            uri_tokenizer="../tokenizers",
        )
        print(f"Loaded model from {uri}.")
        break
        
    except TypeError as e:
        print(e)
        continue

seg_model.model

Loaded model from ../pretrained_segmenter_model.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(6000, 768)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [3]:
seg_model.model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 2,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 6000
}

In [4]:
df_tokenized_split = datasets.load_from_disk("../data/df_tokenized_split_90001_120000")
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25870
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3234
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3234
    })
})

In [13]:
metric_seqeval = datasets.load_metric("seqeval", cache_dir="../cache/metrics")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    true_predictions = [
        [
            pp for (pp, ll) in zip(p, l) if ll != -100
        ]
        for (p, l) in zip(predictions, labels)
    ]
    
    true_labels = [[ll for ll in l if ll != -100] for l in labels]
    
    return metric_seqeval.compute(predictions=true_predictions, references=true_labels)


save_steps = int(df_tokenized_split["train"].num_rows / GRAD_ACCUMULATION_STEPS * 0.10)


training_args = transformers.TrainingArguments(
    output_dir="../segmenter_checkpoint",
    fp16=USE_FP16 and torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=5e-4,
    max_grad_norm=1.0,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.0,
    report_to="all",
)

data_collator = transformers.DataCollatorForTokenClassification(
    seg_model.tokenizer,
    pad_to_multiple_of=8 if USE_FP16 else 1,
)

trainer = transformers.Trainer(
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=df_tokenized_split["eval"],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [6]:
trainer.train()

***** Running training *****
  Num examples = 25870
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 1616


Step,Training Loss,Validation Loss,100,Unnamed: 4,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
161,0.118,0.038035,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.362965
322,0.0345,0.031536,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.363848
483,0.0295,0.029496,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364001
644,0.0276,0.027714,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364286
805,0.0263,0.025121,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364566
966,0.0231,0.025644,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364671
1127,0.0221,0.023848,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364764
1288,0.0215,0.022456,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364876
1449,0.0205,0.021835,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.364969
1610,0.0203,0.021497,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 207437}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204203}",0.0,0.0,0.0,0.365002


***** Running Evaluation *****
  Num examples = 3234
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-161
Configuration saved in ../segmenter_checkpoint/checkpoint-161/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-161/pytorch_model.bin
tokenizer config file saved in ../segmenter_checkpoint/checkpoint-161/tokenizer_config.json
Special tokens file saved in ../segmenter_checkpoint/checkpoint-161/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3234
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-322
Configuration saved in ../segmenter_checkpoint/checkpoint-322/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-322/pytorch_model.bin
tokenizer config

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-644
Configuration saved in ../segmenter_checkpoint/checkpoint-644/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-644/pytorch_model.bin
tokenizer config file saved in ../segmenter_checkpoint/checkpoint-644/tokenizer_config.json
Special tokens file saved in ../segmenter_checkpoint/checkpoint-644/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3234
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-805
Configuration saved in ../segmenter_checkpoint/checkpoint-805/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-805/pytorch_model.bin
tokenizer config file saved in ../segmenter_checkpoint/checkpoint-805/tokenizer_config.json
Special tokens file saved in ../segmenter_checkpo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-1288
Configuration saved in ../segmenter_checkpoint/checkpoint-1288/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-1288/pytorch_model.bin
tokenizer config file saved in ../segmenter_checkpoint/checkpoint-1288/tokenizer_config.json
Special tokens file saved in ../segmenter_checkpoint/checkpoint-1288/special_tokens_map.json
Deleting older checkpoint [../segmenter_checkpoint/checkpoint-483] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 3234
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../segmenter_checkpoint/checkpoint-1449
Configuration saved in ../segmenter_checkpoint/checkpoint-1449/config.json
Model weights saved in ../segmenter_checkpoint/checkpoint-1449/

TrainOutput(global_step=1616, training_loss=0.034285864553017784, metrics={'train_runtime': 5283.6887, 'train_samples_per_second': 9.792, 'train_steps_per_second': 0.306, 'total_flos': 3189391672724352.0, 'train_loss': 0.034285864553017784, 'epoch': 2.0})

In [7]:
seg_model.save_pretrained(TRAINED_MODEL_SAVE_PATH)

Configuration saved in ../pretrained_segmenter_model/config.json
Model weights saved in ../pretrained_segmenter_model/pytorch_model.bin
tokenizer config file saved in ../pretrained_segmenter_model/tokenizer_config.json
Special tokens file saved in ../pretrained_segmenter_model/special_tokens_map.json


In [14]:
y_test_preds = trainer.predict(df_tokenized_split["test"])

***** Running Prediction *****
  Num examples = 3234
  Batch size = 4


  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [None]:
## Original tokenizer
"""
{'test_loss': 1.5868496894836426,
 'test_100': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 214714},
 'test__': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 211480},
 'test_overall_precision': 0.0,
 'test_overall_recall': 0.0,
 'test_overall_f1': 0.0,
 'test_overall_accuracy': 0.047600929576376,
 'test_runtime': 202.4429,
 'test_samples_per_second': 15.975,
 'test_steps_per_second': 3.996}
"""

In [None]:
y_test_preds.metrics