In [1]:
import os
import random
import numpy as np
import pickle

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import matplotlib.pyplot as plt

In [2]:
import re
import spacy

import nltk
import string
#nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
STOP_WORDS = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tiagolima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense

import torch
from datasets import Dataset
    
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments

# Verificar se há uma GPU disponível
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Verificar a quantidade de GPUs disponíveis
if torch.cuda.is_available():
    print(f"Número de GPUs disponíveis: {torch.cuda.device_count()}")
    print(f"Nome da GPU: {torch.cuda.get_device_name(0)}")

2024-07-29 13:38:35.287445: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-29 13:38:35.315939: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Usando dispositivo: cuda
Número de GPUs disponíveis: 1
Nome da GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [4]:
#!pip install torch

In [5]:
# from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
# import numpy as np
# import torch

# # Carregar o tokenizador
# tokenizer = BertTokenizerFast.from_pretrained('neuralmind/bert-base-portuguese-cased')


In [6]:
bio_files_dir = '../data/bio_json_data'
bio_files = [os.path.join(bio_files_dir, f) for f in os.listdir('../data/bio_json_data') if f.endswith('.bio')]

In [7]:
print(f"The number of .bio files is {len(bio_files)}")

The number of .bio files is 285


In [8]:
import sys
sys.path.append('../')
from config import entity_to_acronyms, acronyms_to_entities, MAX_LENGTH #, label_to_index, index_to_label

In [9]:
for bio_file in bio_files:
    with open(bio_file, "r", encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            
            word, tag = line.strip().split('\t')
            if word in STOP_WORDS and tag.startswith('B'):
                print(line)
            # print(line)

In [10]:
def clean_word(word):
    
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]','',word)
    word = re.sub(r'\s+',' ',word)
    
    # convert to lowercase
    word = word.lower()
    
    if word not in STOP_WORDS:
        return word
    
    return ''

In [11]:
# Load a pre-trained Spacy model and set the stop words
nlp = spacy.load('pt_core_news_sm')

def clean_word(word):
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]','',word)
    word = re.sub(r'\s+',' ',word)
    
    # convert to lowercase
    word = word.lower()

    try:
        # lemmatize the word
        lemma = nlp(word)[0].lemma_
        
        # check if the lemma is a stop word
        if lemma not in STOP_WORDS:
            return lemma
    except Exception as e:
        pass
    
    return ''


In [12]:

def parse_data_from_file(bio_file):
    """
    Reads a file in BIO format (one token per line, with tab-separated word and tag),
    and extracts the sentences and labels as lists of lists. Each inner list represents
    a sentence, and contains the words of the sentence in order. Each corresponding inner
    list in the 'labels' list contains the BIO tags for the words in the corresponding
    sentence, in the same order.
    
    Args:
    - bio_file (str): the path to the BioNLP file to read
    
    Returns:
    - A tuple containing:
        - sentences (List[List[str]]): a list of lists, where each inner list represents
          a sentence and contains the words of the sentence in order
        - labels (List[List[str]]): a list of lists, where each inner list corresponds
          to a sentence in the 'sentences' list and contains the BIO tags for the words
          in the corresponding sentence, in the same order.
    """
    sentences = []
    labels = []
    
    with open(bio_file, "r", encoding='utf-8') as f:
        
        current_sentences = []
        current_labels = []
        
        counter = 0
        
        for line in f:
            
            counter += 1
            if line.strip() == '':
                # If we encounter a blank line, it means we've reached the end of a sentence
                if len(current_sentences) > 0:
                    # print(current_sentences)
                    
                    # Add the current sentence and labels to the list
                    sentences.append(current_sentences)
                    labels.append(current_labels)
                    
                    # Reset the current sentence and labels lists
                    current_sentences = []
                    current_labels = []
                    continue
                    
            word, tag = line.strip().split('\t')
            word = clean_word(word)
            
            if word.strip():
                current_sentences.append(word)
                # print(current_sentences)
                if len(current_labels) > 0:
                    if tag[2:] == current_labels[-1][2:] and tag[:2] == "B-":
                        tag = f"I-{tag[2:]}"
                current_labels.append(tag)

        # print('counter', counter)
        if counter > 0:
            # print(len(current_sentences))
            # Add the current sentence and labels to the list
            sentences.append(current_sentences)
            labels.append(current_labels)
        
            current_sentences = []
            current_labels = []
            
    # print(sentences, labels)
    # for (sentence, label) in zip(sentences, labels):
    #     print(sentence, label)
    #     print('*' * 50)
    return sentences, labels

In [13]:
def parse_bio_files(bio_files):
    
    sentences = []
    labels = []
    
    for idx, bio_file in enumerate(bio_files):
        
        curr_sentences, curr_labels = parse_data_from_file(bio_file)
        
        if len(curr_sentences) > 0:
            sentences.extend(curr_sentences)
            labels.extend(curr_labels)
            
        # if (idx+1) % 20 == 0:
        #     print(f'{idx+1} completed')

    return sentences, labels

In [14]:
sentences, labels = parse_bio_files(bio_files)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
print(f"Dataset contains {len(sentences)} examples\n")
# print(labels)

In [None]:
# print(labels)

In [None]:
# label_map = {label: i for i, label in enumerate(labels)}
# print(label_map)

In [20]:
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences[:], labels[:] = zip(*combined)

In [21]:
from sklearn.model_selection import train_test_split


unique_labels = set(element for sublist in labels for element in sublist)
label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
index_to_label = {id: label for label, id in label_to_index.items()}

### Add the new label and ID to the dictionaries
label_to_index['O'] = 0
index_to_label[0] = 'O'

# Função para substituir labels pelos índices
def replace_labels_with_indices(labels, label_to_index):
    return [[label_to_index[label] for label in sublist] for sublist in labels]

# Substituir as labels pelos índices
indexed_labels = replace_labels_with_indices(labels, label_to_index)
# print(indexed_labels)
# Empacotar sentenças e labels
data = list(zip(sentences, indexed_labels))

# Dividir os dados em treino e teste
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Desempacotar sentenças e labels
train_sentences, train_labels = zip(*train_data)
test_sentences, test_labels = zip(*test_data)

# Converter de volta para listas
train_sentences = list(train_sentences)
train_labels = list(train_labels)
test_sentences = list(test_sentences)
test_labels = list(test_labels)

# Exibir os conjuntos de treino e teste
print("Treinamento Sentences:", len(train_sentences))
print("Treinamento Labels:", len(train_labels))
print("Teste Sentences:", len(test_sentences))
print("Teste Labels:", len(test_labels))

Treinamento Sentences: 366
Treinamento Labels: 366
Teste Sentences: 92
Teste Labels: 92


In [22]:


# Carregar o tokenizer BERT pré-treinado
# tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Carregar o tokenizer rápido BERT pré-treinado para português
tokenizer = BertTokenizerFast.from_pretrained('neuralmind/bert-base-portuguese-cased')
# tokenizer.to(device)  # Mover o modelo para a GPU


def tokenize_and_align_labels(sentences, labels, tokenizer, max_length=128):
    tokenized_inputs = tokenizer(sentences, padding='max_length', truncation=True, max_length=max_length, is_split_into_words=True)
    new_labels = []

    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        
        new_labels.append(label_ids)
    
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

# Tokenizar e alinhar labels para treino e teste
train_inputs = tokenize_and_align_labels(train_sentences, train_labels, tokenizer, max_length=128)
test_inputs = tokenize_and_align_labels(test_sentences, test_labels, tokenizer, max_length=128)

# Converter para dataset do Hugging Face
train_dataset = Dataset.from_dict(train_inputs)
test_dataset = Dataset.from_dict(test_inputs)

# Carregar o modelo BERT pré-treinado para classificação de tokens
model = BertForTokenClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=len(set(label for sublist in labels for label in sublist)))
model.to(device)  # Mover o modelo para a GPU

# Definir a função de métricas
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=-1)
    labels = p.label_ids
    # Remover os rótulos de preenchimento para calcular métricas corretamente
    true_predictions = [
        [pred for pred, label in zip(prediction, label) if label != -100]
        for prediction, label in zip(preds, labels)
    ]
    true_labels = [
        [label for label in label_list if label != -100]
        for label_list in labels
    ]
    precision, recall, f1, _ = precision_recall_fscore_support(
        [l for sublist in true_labels for l in sublist],
        [p for sublist in true_predictions for p in sublist],
        average='weighted'
    )
    acc = accuracy_score(
        [l for sublist in true_labels for l in sublist],
        [p for sublist in true_predictions for p in sublist]
    )
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Configurar os parâmetros de treinamento
training_args = TrainingArguments(
    output_dir="./results_v3",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,
    weight_decay=0.03, #0.01
)

# Criar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Treinar o modelo
trainer.train()

# Avaliar o modelo
results = trainer.evaluate()
print("Resultados da Avaliação:", results)

trainer.save_model("./my_model")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.639681,0.368762,0.245219,0.368762,0.259557
2,No log,2.131641,0.445237,0.372857,0.445237,0.361943
3,No log,1.93182,0.506675,0.475987,0.506675,0.445913
4,No log,1.789081,0.546468,0.510608,0.546468,0.49687
5,No log,1.667607,0.593001,0.553339,0.593001,0.556776
6,1.979700,1.654075,0.605962,0.574363,0.605962,0.57516
7,1.979700,1.639859,0.614388,0.575999,0.614388,0.584686
8,1.979700,1.647853,0.616202,0.590342,0.616202,0.587732
9,1.979700,1.69608,0.62022,0.588117,0.62022,0.592255
10,1.979700,1.685435,0.617369,0.588278,0.617369,0.594199


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Resultados da Avaliação: {'eval_loss': 2.1486799716949463, 'eval_accuracy': 0.632145171743357, 'eval_precision': 0.6229526048658711, 'eval_recall': 0.632145171743357, 'eval_f1': 0.6214238202766369, 'eval_runtime': 0.6373, 'eval_samples_per_second': 144.358, 'eval_steps_per_second': 36.09, 'epoch': 30.0}


In [None]:
# Função para tokenizar e preparar o texto
def tokenize_and_prepare_texts(texts, tokenizer, max_length=128):
    tokenized_inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, is_split_into_words=True, return_tensors="pt")
    return tokenized_inputs


In [None]:
# Novo texto para validação

# new_text = ["Acesso venoso central em subclavia D duplolumen recebendo solução salina e glicosada em BI."]
new_text = ["Paciente com Sepse pulmonar em D8 tazocin (paciente não recebeu por 2 dias Atb)."]

# new_text = ["#exames laboratoriais lab (03/03/2021) hb 9.09/ ht 24.7/ leuco 12800/ plaq 131.000/ pcr 370/ su nitrito negativo, leuco 15 20/cp, 1 2 hemacias varias celulas epiteliais, varias bacterias #."]
new_text = [ "admissao paciente 16 anos, g1p0, admitida dia 21/12/21, no curso de 31.2 sem, encaminhada de gravata devido historia de perda de liquido amniotico claro desde as 11h do dia 21/12. negava comorbidades relatava boa movimentacao fetal apresenta registro em encaminhamento de realizacao de 1 dose de dexametasona em servico de origem as 16:40h."]
# Tokenizar o novo texto
tokenized_inputs = tokenize_and_prepare_texts(new_text, tokenizer)

# Mover os tensores para a GPU se disponível
tokenized_inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}

In [33]:
model.eval()  # Colocar o modelo em modo de avaliação

with torch.no_grad():
    outputs = model(**tokenized_inputs)

# Obter as previsões
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Inverter o dicionário de mapeamento para obter rótulos a partir dos índices
index_to_label = {idx: label for label, idx in label_to_index.items()}

# Obter as palavras do novo texto tokenizado
tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][0])

# Mapear as previsões para os rótulos
predicted_labels = [index_to_label[idx.item()] for idx in predictions[0]]

# Remover tokens especiais ([CLS], [SEP], [PAD])
tokens_labels = [(token, label) for token, label in zip(tokens, predicted_labels) if token not in ["[CLS]", "[SEP]", "[PAD]"]]

# Reconstruir palavras a partir dos tokens (tokens BERT podem ser subpalavras)
word_labels = []
current_word = ""
current_label = ""
for token, label in tokens_labels:
    if token.startswith("##"):
        current_word += token[2:]
    else:
        if current_word:
            word_labels.append((current_word, current_label))
        current_word = token
        current_label = label
if current_word:
    word_labels.append((current_word, current_label))

# Exibir as palavras do texto com suas respectivas previsões
for word, label in word_labels:
    print(f"{word}: {label}")

admissao: O
paciente: B-SUB
16: B-AGE
anos: I-AGE
,: O
g1p0: B-PREH
,: O
admitida: B-CLE
dia: O
21: B-TIM
/: B-TIM
12: B-TIM
/: B-TIM
21: B-TIM
,: O
no: O
curso: O
de: O
31: O
.: O
2: O
sem: O
,: O
encaminhada: B-CLE
de: B-DET
gravata: I-DET
devido: B-DET
historia: B-DID
de: B-DID
perda: B-BAT
de: B-BAT
liquido: B-BAT
amniotico: B-COL
claro: I-BAT
desde: O
as: B-TIM
11h: B-TIM
do: B-TIM
dia: B-TIM
21: B-TIM
/: B-TIM
12: B-TIM
.: O
negava: B-SIG
comorbidades: I-SIG
relatava: B-SIG
boa: B-SIG
movimentacao: I-SIG
fetal: I-SIG
apresenta: O
registro: O
em: O
encaminhamento: I-DET
de: I-DET
realizacao: I-DET
de: O
1: B-DOS
dose: I-DOS
de: B-MED
dexametasona: B-MED
em: O
servico: I-HIS
de: I-DET
origem: I-HIS
as: O
16: B-TIM
:: B-TIM
40h: B-TIM
.: O


In [34]:
sentences_index = np.arange(0, len(sentences))
labels_index = np.arange(0, len(labels))

# sentences = np.array(sentences)
# labels = np.array(labels)


# sss_train_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# train_index, val_test_index = next(sss_train_val_test.split(sentences, labels))


# print(train_index)




# # Primeiro, dividir em treinamento e (validação + teste)
# sss_train_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# train_index, val_test_index = next(sss_train_val_test.split(sentences, labels))

# X_train, X_val_test = sentences[train_index], sentences[val_test_index]
# y_train, y_val_test = labels[train_index], labels[val_test_index]

# # Em seguida, dividir (validação + teste) em validação e teste
# sss_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
# val_index, test_index = next(sss_val_test.split(X_val_test, y_val_test))

# X_val, X_test = X_val_test[val_index], X_val_test[test_index]
# y_val, y_test = y_val_test[val_index], y_val_test[test_index]

In [35]:
# Split the data into training, validation, and test sets

TEST_SIZE = 0.2

num_sentences = len(sentences)
num_train = int(num_sentences * (1 - TEST_SIZE - 0.1))
num_valid = int(num_sentences * 0.1)

In [36]:
train_sentences = sentences[:num_train]
train_labels = labels[:num_train]

valid_sentences = sentences[num_train:num_train+num_valid]
valid_labels = labels[num_train:num_train+num_valid]

test_sentences = sentences[num_train+num_valid:]
test_labels = labels[num_train+num_valid:]

In [37]:
unique_labels = set(element for sublist in labels for element in sublist)

In [38]:
label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
index_to_label = {id: label for label, id in label_to_index.items()}

### Add the new label and ID to the dictionaries
label_to_index['<PAD>'] = 0
index_to_label[0] = '<PAD>'

In [39]:
print(unique_labels)

{'B-SUB', 'B-BAT', 'B-HEI', 'I-PREG', 'I-QLTC', 'B-HIS', 'I-ARA', 'B-THP', 'B-PREH', 'I-TIM', 'I-NBL', 'B-OUT', 'B-OTE', 'B-FAM', 'B-DOS', 'B-BST', 'B-QLTC', 'I-ACT', 'B-ADM', 'I-WEI', 'I-AGE', 'B-PER', 'B-DID', 'I-DOS', 'I-ADM', 'I-MED', 'B-ACT', 'B-WEI', 'B-MED', 'I-PREH', 'B-SEX', 'I-DIS', 'B-OCC', 'I-OTE', 'I-QNTC', 'B-VOL', 'I-DET', 'I-VOL', 'I-DUR', 'I-SEV', 'B-OTH', 'I-MEDT', 'B-FRE', 'B-NBL', 'B-QNTC', 'B-MEDT', 'I-SIG', 'B-DIA', 'B-LAB', 'B-TIM', 'I-HEI', 'B-COL', 'I-OCC', 'B-DIS', 'I-FAM', 'B-DUR', 'I-COL', 'I-SUB', 'I-BAT', 'I-HIS', 'I-LAB', 'I-CLE', 'B-SEV', 'I-DIA', 'B-AGE', 'I-OTH', 'I-PER', 'B-SIG', 'B-CLE', 'I-DID', 'B-ARA', 'I-BST', 'I-FRE', 'I-DAT', 'I-OUT', 'O', 'B-DAT', 'B-DET', 'B-PREG', 'I-THP'}


In [40]:
print(label_to_index)
# label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
# index_to_label = {id: label for label, id in label_to_index.items()}

{'B-ACT': 1, 'B-ADM': 2, 'B-AGE': 3, 'B-ARA': 4, 'B-BAT': 5, 'B-BST': 6, 'B-CLE': 7, 'B-COL': 8, 'B-DAT': 9, 'B-DET': 10, 'B-DIA': 11, 'B-DID': 12, 'B-DIS': 13, 'B-DOS': 14, 'B-DUR': 15, 'B-FAM': 16, 'B-FRE': 17, 'B-HEI': 18, 'B-HIS': 19, 'B-LAB': 20, 'B-MED': 21, 'B-MEDT': 22, 'B-NBL': 23, 'B-OCC': 24, 'B-OTE': 25, 'B-OTH': 26, 'B-OUT': 27, 'B-PER': 28, 'B-PREG': 29, 'B-PREH': 30, 'B-QLTC': 31, 'B-QNTC': 32, 'B-SEV': 33, 'B-SEX': 34, 'B-SIG': 35, 'B-SUB': 36, 'B-THP': 37, 'B-TIM': 38, 'B-VOL': 39, 'B-WEI': 40, 'I-ACT': 41, 'I-ADM': 42, 'I-AGE': 43, 'I-ARA': 44, 'I-BAT': 45, 'I-BST': 46, 'I-CLE': 47, 'I-COL': 48, 'I-DAT': 49, 'I-DET': 50, 'I-DIA': 51, 'I-DID': 52, 'I-DIS': 53, 'I-DOS': 54, 'I-DUR': 55, 'I-FAM': 56, 'I-FRE': 57, 'I-HEI': 58, 'I-HIS': 59, 'I-LAB': 60, 'I-MED': 61, 'I-MEDT': 62, 'I-NBL': 63, 'I-OCC': 64, 'I-OTE': 65, 'I-OTH': 66, 'I-OUT': 67, 'I-PER': 68, 'I-PREG': 69, 'I-PREH': 70, 'I-QLTC': 71, 'I-QNTC': 72, 'I-SEV': 73, 'I-SIG': 74, 'I-SUB': 75, 'I-THP': 76, 'I-TIM': 7

In [41]:
print(index_to_label)
# # Add the new label and ID to the dictionaries
# label_to_index['<PAD>'] = 0
# index_to_label[0] = '<PAD>'

{1: 'B-ACT', 2: 'B-ADM', 3: 'B-AGE', 4: 'B-ARA', 5: 'B-BAT', 6: 'B-BST', 7: 'B-CLE', 8: 'B-COL', 9: 'B-DAT', 10: 'B-DET', 11: 'B-DIA', 12: 'B-DID', 13: 'B-DIS', 14: 'B-DOS', 15: 'B-DUR', 16: 'B-FAM', 17: 'B-FRE', 18: 'B-HEI', 19: 'B-HIS', 20: 'B-LAB', 21: 'B-MED', 22: 'B-MEDT', 23: 'B-NBL', 24: 'B-OCC', 25: 'B-OTE', 26: 'B-OTH', 27: 'B-OUT', 28: 'B-PER', 29: 'B-PREG', 30: 'B-PREH', 31: 'B-QLTC', 32: 'B-QNTC', 33: 'B-SEV', 34: 'B-SEX', 35: 'B-SIG', 36: 'B-SUB', 37: 'B-THP', 38: 'B-TIM', 39: 'B-VOL', 40: 'B-WEI', 41: 'I-ACT', 42: 'I-ADM', 43: 'I-AGE', 44: 'I-ARA', 45: 'I-BAT', 46: 'I-BST', 47: 'I-CLE', 48: 'I-COL', 49: 'I-DAT', 50: 'I-DET', 51: 'I-DIA', 52: 'I-DID', 53: 'I-DIS', 54: 'I-DOS', 55: 'I-DUR', 56: 'I-FAM', 57: 'I-FRE', 58: 'I-HEI', 59: 'I-HIS', 60: 'I-LAB', 61: 'I-MED', 62: 'I-MEDT', 63: 'I-NBL', 64: 'I-OCC', 65: 'I-OTE', 66: 'I-OTH', 67: 'I-OUT', 68: 'I-PER', 69: 'I-PREG', 70: 'I-PREH', 71: 'I-QLTC', 72: 'I-QNTC', 73: 'I-SEV', 74: 'I-SIG', 75: 'I-SUB', 76: 'I-THP', 77: 'I-TIM

In [42]:
NUM_CLASSES = len(index_to_label)

In [43]:
MAX_LENGTH = 100

train_labels = [[label_to_index[label] for label in labels] for labels in train_labels]
train_labels = pad_sequences(train_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
train_labels = to_categorical(train_labels, num_classes=NUM_CLASSES)

valid_labels = [[label_to_index[label] for label in labels] for labels in valid_labels]
valid_labels = pad_sequences(valid_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
valid_labels = to_categorical(valid_labels, num_classes=NUM_CLASSES)

test_labels = [[label_to_index[label] for label in labels] for labels in test_labels]
test_labels = pad_sequences(test_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
test_labels = to_categorical(test_labels, num_classes=NUM_CLASSES)

In [44]:
# Convert the input sentences to sequences of word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)


train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(valid_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)


# Pad the sequences to a fixed length
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

In [45]:
np.savez(
    '../data/data.npz',
     train_sequences_padded=train_sequences_padded,
     train_labels=train_labels,
     val_sequences_padded=val_sequences_padded,
     val_labels=valid_labels,
     test_sequences_padded=test_sequences_padded,
     test_labels=test_labels,
     label_to_index=label_to_index,
     index_to_label=index_to_label
)

In [46]:
train_labels.shape

(320, 100, 81)

In [33]:
# Carregar o modelo BERT pré-treinado para token classification
# model = TFBertForTokenClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=len(label_list))

# # Compilar o modelo
# optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# metrics = [tf.keras.metrics.CategoricalAccuracy()]

# model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# # Treinar o modelo
# model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# # Salvar o modelo treinado
# model.save_pretrained('./fine_tuned_ner_model')
# tokenizer.save_pretrained('./fine_tuned_ner_model')

In [34]:
INPUT_DIM = len(tokenizer.word_index)+1
EMBEDDING_DIM = 216
NUM_CLASSES = len(label_to_index)
MAX_LENGTH = train_sequences_padded.shape[1]
LSTM_UNITS = 64
DROPOUT = 0.2
BATCH_SIZE = 32
EPOCHS = 50

In [35]:
from keras import backend as K

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    _precision = true_positives / (predicted_positives + K.epsilon())
    return _precision

def recall(y_true, y_pred):
    """Compute recall metric"""
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def f1_score(y_true, y_pred):
    """Compute f1-score metric"""
    _precision = precision(y_true, y_pred)
    _recall = recall(y_true, y_pred)
    f1_score = 2 * ((_precision * _recall) / (_precision + _recall + K.epsilon()))
    return f1_score

In [36]:
# Define the model architecture
model = tf.keras.models.Sequential([
    Embedding(INPUT_DIM, EMBEDDING_DIM, input_length=MAX_LENGTH),
    Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', precision, recall, f1_score])

model.summary()

2024-06-17 14:59:35.760789: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-17 14:59:35.761218: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-06-17 14:59:35.905986: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gra

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 216)          552096    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         143872    
 l)                                                              
                                                                 
 dense (Dense)               (None, 100, 79)           10191     
                                                                 
Total params: 706,159
Trainable params: 706,159
Non-trainable params: 0
_________________________________________________________________


2024-06-17 14:59:35.991175: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-06-17 14:59:35.991903: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-06-17 14:59:35.992620: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [37]:
# Train the model
history = model.fit(
    train_sequences_padded, 
    train_labels, 
    epochs=EPOCHS, 
    validation_data=(val_sequences_padded, valid_labels)
)

Epoch 1/50


2024-06-17 14:59:45.051456: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-06-17 14:59:45.052263: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-06-17 14:59:45.052814: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-06-17 14:59:46.571361: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-06-17 14:59:46.572202: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-06-17 14:59:46.572775: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [38]:
# Assuming tokenizer is your trained tokenizer
with open('../data/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)