# Masked language modeling (MLM) para textos radiológicos

**Objetivo**: Treinar um modelo de *masked language modeling* (MLM) em textos radiológicos para avaliar seu desempenho e possíveis aplicações futuras.

In [None]:
!pip install transformers
!pip install datasets

# Importando bibliotecas

In [35]:
from itertools import chain

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, BertForTokenClassification
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Pré-processando o texto

In [3]:
# Model identifier shared by the tokenizer here and the masked-LM model loaded later.
checkpoint = "neuralmind/bert-base-portuguese-cased"
# use_fast=True requests the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

In [4]:
# Load the train/validation CSV files from Google Drive as two named splits.
# NOTE(review): the later tokenization step reads a "text" column — confirm both CSVs provide it.
datasets = load_dataset("csv", data_files={"train": '/content/drive/MyDrive/train.csv', "validation": '/content/drive/MyDrive/valid.csv'})

Using custom data configuration default-fd164f70e2c67d35
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Maximum number of tokens per sequence; used for truncation and as the chunk size when grouping.
max_seq_length = 512
# Number of worker processes passed to `Dataset.map`.
num_proc = 4

def tokenize_function(examples):
    """Tokenize the non-empty entries of a batch's ``"text"`` column.

    Entries that are not strings (e.g. ``None`` coming from an empty CSV cell),
    empty strings and whitespace-only strings are skipped, so the returned batch
    may contain fewer rows than the input batch.

    Args:
        examples: batch mapping that provides a ``"text"`` list.

    Returns:
        The tokenizer output for the kept texts, truncated to ``max_seq_length`` tokens.
    """
    # Filter into a local list instead of overwriting examples["text"], so the
    # caller's batch is left untouched; the isinstance check avoids the
    # TypeError that len(None) would raise on missing values.
    texts = [
        line
        for line in examples["text"]
        if isinstance(line, str) and len(line) > 0 and not line.isspace()
    ]
    return tokenizer(
        texts,
        truncation=True,
        max_length=max_seq_length,
    )
  
# Tokenize every split in batches across `num_proc` workers; the raw "text"
# column is dropped so only the tokenizer outputs remain.
tokenize_map_options = dict(
    batched=True,
    num_proc=num_proc,
    remove_columns=["text"],
)
tokenized_dataset = datasets.map(tokenize_function, **tokenize_map_options)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-76eba31974506314.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-a9fe965a0df55ea2.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-d6f1d85486d5b187.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-88442e9908ff4602.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-11f153a31a34beaf.arrow
Loadi

In [6]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size blocks.

    Args:
        examples: mapping of column name -> list of per-example token lists
            (must include ``"input_ids"``).
        block_size: length of every output block. Defaults to the notebook-level
            ``max_seq_length`` when omitted, which is how ``Dataset.map`` calls it.

    Returns:
        A dict with the same columns, each holding blocks of exactly
        ``block_size`` items, plus a ``"labels"`` column copied from ``"input_ids"``.
        The trailing remainder that does not fill a whole block is dropped.
    """
    if block_size is None:
        block_size = max_seq_length
    # Flatten each column in linear time (sum(lists, []) is quadratic).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    first_column = next(iter(concatenated_examples))
    total_length = len(concatenated_examples[first_column])
    # Drop the small remainder so every block has exactly block_size items.
    total_length = (total_length // block_size) * block_size
    # Split each flattened column into consecutive blocks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # NOTE(review): the MLM data collator presumably builds its own labels at
    # batch time, which would make this column redundant — confirm before removing.
    result["labels"] = result["input_ids"].copy()
    return result

# Because the map is batched, group_texts sees 1,000 texts at a time and
# discards one remainder per batch. The batch size can be changed here, but
# larger values may make preprocessing slower.
tokenized_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=num_proc)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-ab7a6316d3a88d94.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-fb916397e1d6135d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-10d7c5d34336d917.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-aeca663d70a1d421.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-fd164f70e2c67d35/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-6d3bf2212d9088b9.arrow
Loadi

In [7]:
# Load the pretrained checkpoint with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Short model name (the part after the last "/") reused for output folder names.
model_name = checkpoint.rsplit("/", 1)[-1]
training_output_dir = f"{model_name}-finetuned-CTReports"

# Evaluate and log once per epoch; optimizer hyperparameters are set explicitly.
training_args = TrainingArguments(
    output_dir=training_output_dir,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [10]:
# Collator for language modeling batches; mlm_probability=0.15 is the masking rate passed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Treinando o modelo

In [11]:
# Pick the grouped splits explicitly before wiring everything into the Trainer.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [12]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 1122
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 423


Epoch,Training Loss,Validation Loss
1,1.8286,0.952228
2,0.9648,0.813143
3,0.8779,0.781739


***** Running Evaluation *****
  Num examples = 553
  Batch size = 8
***** Running Evaluation *****
  Num examples = 553
  Batch size = 8
***** Running Evaluation *****
  Num examples = 553
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=423, training_loss=1.2237616103873468, metrics={'train_runtime': 884.4963, 'train_samples_per_second': 3.806, 'train_steps_per_second': 0.478, 'total_flos': 885939872108544.0, 'train_loss': 1.2237616103873468, 'epoch': 3.0})

In [13]:
# Persist the fine-tuned model to Google Drive.
finetuned_model_dir = '/content/drive/MyDrive/' + f"{model_name}-finetuned-MLM-Signs"
trainer.save_model(finetuned_model_dir)
# The Trainer was created without a tokenizer, so save_model writes only the
# config and weights; save the tokenizer too so the folder can be loaded on its own.
tokenizer.save_pretrained(finetuned_model_dir);

Saving model checkpoint to /content/drive/MyDrive/bert-base-portuguese-cased-finetuned-MLM-Signs
Configuration saved in /content/drive/MyDrive/bert-base-portuguese-cased-finetuned-MLM-Signs/config.json
Model weights saved in /content/drive/MyDrive/bert-base-portuguese-cased-finetuned-MLM-Signs/pytorch_model.bin


In [30]:
# Reload the fine-tuned masked-LM weights from the Drive folder written by the save step.
model_sign = AutoModelForMaskedLM.from_pretrained("/content/drive/MyDrive/bert-base-portuguese-cased-finetuned-MLM-Signs")

loading configuration file /content/drive/MyDrive/bert-base-portuguese-cased-finetuned-MLM-Signs/config.json
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 29794
}

loa

In [33]:
# Fill-mask pipeline built from the reloaded fine-tuned model and the original tokenizer.
pipe = pipeline(task='fill-mask', model=model_sign, tokenizer=tokenizer)

# Qualitative check: show the top predictions for the masked word in a report phrase.
masked_sentence = 'aorta [MASK] de calibre mantido'
pipe(masked_sentence)

[{'score': 0.14783015847206116,
  'sequence': 'aorta direita de calibre mantido',
  'token': 5065,
  'token_str': 'direita'},
 {'score': 0.13071046769618988,
  'sequence': 'aorta : de calibre mantido',
  'token': 131,
  'token_str': ':'},
 {'score': 0.09723420441150665,
  'sequence': 'aorta dorsal de calibre mantido',
  'token': 21966,
  'token_str': 'dorsal'},
 {'score': 0.05238368362188339,
  'sequence': 'aorta esquerda de calibre mantido',
  'token': 4573,
  'token_str': 'esquerda'},
 {'score': 0.048720721155405045,
  'sequence': 'aorta cardíaca de calibre mantido',
  'token': 20146,
  'token_str': 'cardíaca'}]