https://towardsdatascience.com/custom-named-entity-recognition-with-bert-cf1fd4510804

## Run Separately

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer
from transformers import BertForTokenClassification
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Model

In [3]:
class BioBertNER(nn.Module):
  """
  Implement NN class based on distilbert pretrained from Hugging face.
  Inputs :
    tokens_dim : int specifyng the dimension of the classifier
  """

  def __init__(self, tokens_dim):
    super(BioBertNER,self).__init__()

    if type(tokens_dim) != int:
            raise TypeError('Please tokens_dim should be an integer')

    if tokens_dim <= 0:
          raise ValueError('Classification layer dimension should be at least 1')

    self.pretrained = BertForTokenClassification.from_pretrained("pucpr/biobertpt-all", num_labels = tokens_dim) #set the output of each token classifier = unique_lables


  def forward(self, input_ids, attention_mask, labels = None): #labels are needed in order to compute the loss
    """
  Forwad computation of the network
  Input:
    - inputs_ids : from model tokenizer
    - attention :  mask from model tokenizer
    - labels : if given the model is able to return the loss value
  """

    #inference time no labels
    if labels == None:
      out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
      return out

    out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
    return out

# Dataset

In [4]:
class NerDataset(torch.utils.data.Dataset):
  """
  Custom dataset implementation to get (text,labels) tuples
  Inputs:
   - df : dataframe with columns [tags, sentence]
  """

  def __init__(self, df):
    if not isinstance(df, pd.DataFrame):
      raise TypeError('Input should be a dataframe')

    if "tags" not in df.columns or "sentence" not in df.columns:
      raise ValueError("Dataframe should contain 'tags' and 'sentence' columns")

    tags_list = [i.split() for i in df["tags"].values.tolist()]
    texts = df["sentence"].values.tolist()

    self.texts = [tokenizer(text, padding = "max_length", max_length = 512, truncation = True, return_tensors = "pt") for text in texts]
    self.labels = [match_tokens_labels(text, tags) for text,tags in zip(self.texts, tags_list)]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_text = self.texts[idx]
    batch_labels = self.labels[idx]

    return batch_text, torch.LongTensor(batch_labels)

# Metrics

In [5]:
class MetricsTracking():
  """
  In order make the train loop lighter I define this class to track all the metrics that we are going to measure for our model.
  """
  def __init__(self):

    self.total_acc = 0
    self.total_f1 = 0
    self.total_precision = 0
    self.total_recall = 0

  def update(self, predictions, labels , ignore_token = -100):
    '''
    Call this function every time you need to update your metrics.
    Where in the train there was a -100, were additional token that we dont want to label, so remove them.
    If we flatten the batch its easier to access the indexed = -100
    '''
    predictions = predictions.flatten()
    labels = labels.flatten()

    predictions = predictions[labels != ignore_token]
    labels = labels[labels != ignore_token]

    predictions = predictions.to("cpu")
    labels = labels.to("cpu")

    #print(labels)
    #print(predictions)
    #print()  
    #print("Tamanho dos labels", len(labels))
    #print("Tamano dos predictions", len(predictions))
    #print()
    
    acc = accuracy_score(labels,predictions)
    f1 = f1_score(labels, predictions, average = "macro")
    precision = precision_score(labels, predictions, average = "macro")
    recall = recall_score(labels, predictions, average = "macro")

    self.total_acc  += acc
    self.total_f1 += f1
    self.total_precision += precision
    self.total_recall  += recall

  def return_avg_metrics(self,data_loader_size):
    n = data_loader_size
    metrics = {
        "acc": round(self.total_acc / n ,3),
        "f1": round(self.total_f1 / n, 3),
        "precision" : round(self.total_precision / n, 3),
        "recall": round(self.total_recall / n, 3)
          }
    return metrics

# Utils

In [6]:
def tags_2_labels(tags : str, tag2idx : dict):
  '''
  Method that takes a list of tags and a dictionary mapping and returns a list of labels (associated).
  Used to create the "label" column in df from the "tags" column.
  '''
  return [tag2idx[tag] if tag in tag2idx else unseen_label for tag in tags.split()]

def tags_mapping(tags_series : pd.Series):
  """
  tag_series = df column with tags for each sentence.
  Returns:
    - dictionary mapping tags to indexes (label)
    - dictionary mappign inedexes to tags
    - The label corresponding to tag 'O'
    - A set of unique tags ecountered in the trainind df, this will define the classifier dimension
  """

  if not isinstance(tags_series, pd.Series):
      raise TypeError('Input should be a padas Series')

  unique_tags = set()

  for tag_list in df_train["tags"]:
    for tag in tag_list.split():
      unique_tags.add(tag)


  tag2idx = {k:v for v,k in enumerate(sorted(unique_tags))}
  idx2tag = {k:v for v,k in tag2idx.items()}

  unseen_label = tag2idx["O"]

  return tag2idx, idx2tag, unseen_label, unique_tags

def match_tokens_labels(tokenized_input, tags, ignore_token = -100):
  '''
  Used in the custom dataset.
  -100 will be tha label used to match additional tokens like [CLS] [PAD] that we dont care about.
  Inputs :
    - tokenized_input : tokenizer over the imput text -> {input_ids, attention_mask}
    - tags : is a single label array -> [O O O O O O O O O O O O O O B-tim O]

  Returns a list of labels that match the tokenized text -> [-100, 3,5,6,-100,...]
  '''

  #gives an array [ None , 0 , 1 ,2 ,... None]. Each index tells the word of reference of the token
  word_ids = tokenized_input.word_ids()

  previous_word_idx = None
  label_ids = []

  for word_idx in word_ids:

      if word_idx is None:
          label_ids.append(ignore_token)

      #if its equal to the previous word we can add the same label id of the provious or -100
      else :
          try:
            reference_tag = tags[word_idx]
            label_ids.append(tag2idx[reference_tag])
          except:
            label_ids.append(ignore_token)

      previous_word_idx = word_idx

  return label_ids

def get_labels_unique_word(tokenized_input, predictions):
    word_ids = tokenized_input.word_ids()
    
    previous_word_idx = -1
    unique_tags_pred = []

    for word_idx in word_ids:
        if word_idx is None or (word_idx == previous_word_idx):
            continue

        else:
            idx_in_word_ids_array = word_ids.index(word_idx)
            reference_tag = predictions[idx_in_word_ids_array]
            unique_tags_pred.append(idx2tag[reference_tag])

        previous_word_idx = word_idx
  
    return unique_tags_pred

# Train

In [7]:
def train_loop_wt_eval(model, train_dataset, optimizer,  batch_size, epochs):

  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) :

    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #train mode

    for train_data, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      '''
      squeeze in order to match the sizes. From [batch,1,seq_len] --> [batch,seq_len]
      '''
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()

      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1)

      #compute metrics
      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      #grad step
      loss.backward()
      optimizer.step()

    train_results = train_metrics.return_avg_metrics(len(train_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" )

In [8]:
def train_loop(model, train_dataset, dev_dataset, optimizer,  batch_size, epochs):

  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) :

    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #train mode

    for train_data, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      '''
      squeeze in order to match the sizes. From [batch,1,seq_len] --> [batch,seq_len]
      '''
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()

      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1)

      #compute metrics
      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      #grad step
      loss.backward()
      optimizer.step()

    '''
    EVALUATION MODE
    '''
    model.eval()

    dev_metrics = MetricsTracking()
    total_loss_dev = 0

    with torch.no_grad():
      for dev_data, dev_label in dev_dataloader:

        dev_label = dev_label.to(device)

        mask = dev_data['attention_mask'].squeeze(1).to(device)
        input_id = dev_data['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask, dev_label)
        loss, logits = output.loss, output.logits

        
        predictions = logits.argmax(dim= -1)

        dev_metrics.update(predictions, dev_label)
        total_loss_dev += loss.item()

    train_results = train_metrics.return_avg_metrics(len(train_dataloader))
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" )
    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )

# Eval

In [9]:
def get_complete_tokens_and_labels_and_labels_pred(tokens, dev_labels, pred_labels):
    """
    Retorna uma lista de tokens completos (sem prefixos) e suas labels correspondentes,
    ignorando os tokens especiais [CLS], [SEP], [PAD].

    Args:
        tokens (List[str]): Lista de tokens (incluindo subpalavras com ##).
        dev_labels (List[int]): Lista de labels correspondentes aos tokens.
        pred_labels (List[int]): Lista de labels preditos correspondentes aos tokens.

    Returns:
        List[str]: Lista de tokens completos.
        List[int]: Lista de labels correspondentes aos tokens completos.
        List[int]: Lista de labels preditos correspondentes aos tokens completos.
    """
    
    complete_tokens = []
    complete_dev_labels = []
    complete_pred_labels = []
    current_token = ""
    current_dev_label = None
    current_pred_label = None

    for token, dev_label, pred_label in zip(tokens, dev_labels, pred_labels):
        # Ignora tokens especiais
        if token in ['[CLS]', '[SEP]', '[PAD]']:
            continue

        # Verifica se é uma subpalavra
        if token.startswith("##"):
            current_token += token[2:]  # Adiciona a subpalavra ao token atual
        else:
            # Se existia um token atual, armazena-o
            if current_token:
                complete_tokens.append(current_token)
                complete_dev_labels.append(current_dev_label)
                complete_pred_labels.append(current_pred_label)

            # Inicia um novo token
            current_token = token
            current_dev_label = dev_label
            current_pred_label = pred_label

    # Adiciona o último token, se existir
    if current_token:
        complete_tokens.append(current_token)
        complete_dev_labels.append(current_dev_label)
        complete_pred_labels.append(current_pred_label)

    return complete_tokens, complete_dev_labels, complete_pred_labels

In [10]:
def adjust_labels(labels):
    """Ajusta os rótulos substituindo -100 por 12."""
    return [12 if label == -100 else label for label in labels]

In [11]:
def evaluate_test_texts(model, df_test, batch_size = 1):

    dev_dataset = NerDataset(df_test)
    dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    dev_metrics = MetricsTracking()
    total_loss_dev = 0

    test_text, test_labels, test_labels_pred = [], [], []

    with torch.no_grad():
        for dev_data, dev_label in dev_dataloader:

            dev_label = dev_label.to(device)
            list_labels = dev_label.squeeze(0).tolist()
            list_labels = adjust_labels(list_labels)

            mask = dev_data['attention_mask'].squeeze(1).to(device)
            input_id = dev_data['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask, dev_label)
            loss, logits = output.loss, output.logits
            predictions = logits.argmax(dim= -1)
            
            list_text_tokenized = tokenizer.convert_ids_to_tokens(input_id.squeeze())
            list_labels_pred = torch.argmax(logits, dim=-1).squeeze().tolist()
            
            
            str_complete_tokens, int_complete_labels, int_complete_labels_pred = get_complete_tokens_and_labels_and_labels_pred(list_text_tokenized, 
                                                                                                                                list_labels, 
                                                                                                                                list_labels_pred)
            str_complete_labels = [idx2tag[int_label] for int_label in int_complete_labels]
            str_complete_labels_pred = [idx2tag[int_label] for int_label in int_complete_labels_pred]

            print("Tamanho lista de tokens", len(str_complete_tokens))
            test_text.append(str_complete_tokens)
            test_labels.append(str_complete_labels)
            test_labels_pred.append(str_complete_labels_pred)

            dev_metrics.update(predictions, dev_label)
            total_loss_dev += loss.item()

            
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )

    return test_text, test_labels, test_labels_pred

# Run

In [12]:
file_path = 'data/df_all_resports_full_sentence.csv'

df_total = pd.read_csv(file_path, encoding='utf-8')
df_total.rename(columns = {'text':'sentence', 'iob_labels':'tags'}, inplace = True)

train_file_path = 'data/df_train_llms_full_sentences_divide_by_four.csv'
test_file_path = 'data/df_test_llms_full_sentences_divide_by_four.csv'

df_train = pd.read_csv(train_file_path, encoding='utf-8')
df_test = pd.read_csv(test_file_path, encoding='utf-8')

df_train.rename(columns = {'text':'sentence', 'iob_labels':'tags'}, inplace = True)
df_test.rename(columns = {'text':'sentence', 'iob_labels':'tags'}, inplace = True)

In [13]:
#create tag-label mapping
#tag2idx, idx2tag , unseen_label, unique_tags = tags_mapping(df_train["tags"])
tag2idx, idx2tag , unseen_label, unique_tags = tags_mapping(df_total["tags"])

#create the label column from tag. Unseen labels will be tagged as "O"
for df in [df_train, df_test]:
  df["labels"] = df["tags"].apply(lambda tags : tags_2_labels(tags, tag2idx))

#original text
#text = df_train["sentence"].values.tolist()
text = df_total["sentence"].values.tolist()

#toeknized text
tokenizer = AutoTokenizer.from_pretrained("pucpr/biobertpt-all", do_lower_case=False)
text_tokenized = tokenizer(text , padding = "max_length", max_length = 512, truncation = True, return_tensors = "pt" )

#mapping token to original word
word_ids = text_tokenized.word_ids()

#datasets
train_dataset = NerDataset(df_train)
#dev_dataset = NerDataset(df_dev)

learning_rate = 1e-3
epochs = 15
batch_size = 8

In [14]:
model = BioBertNER(len(unique_tags))

optimizer = SGD(model.parameters(), lr=learning_rate, momentum = 0.9)

parameters = {
    "model": model,
    "train_dataset": train_dataset,
    "optimizer" : optimizer,
    "epochs" : epochs,
    "batch_size" : batch_size
}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at pucpr/biobertpt-all and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
train_loop_wt_eval(**parameters)
torch.save(model.state_dict(), 'models/model.pth')

100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.023168767750067522 
Metrics {'acc': 0.958, 'f1': 0.364, 'precision': 0.375, 'recall': 0.366}



100%|██████████| 431/431 [03:29<00:00,  2.05it/s]


TRAIN 
Loss: 0.00930831246231299 
Metrics {'acc': 0.98, 'f1': 0.583, 'precision': 0.592, 'recall': 0.602}



100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.0061286779372623215 
Metrics {'acc': 0.986, 'f1': 0.7, 'precision': 0.709, 'recall': 0.717}



100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.004541048573128243 
Metrics {'acc': 0.989, 'f1': 0.778, 'precision': 0.787, 'recall': 0.791}



100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.0036884666512637873 
Metrics {'acc': 0.991, 'f1': 0.818, 'precision': 0.828, 'recall': 0.83}



100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.0030951301456707474 
Metrics {'acc': 0.993, 'f1': 0.853, 'precision': 0.862, 'recall': 0.861}



100%|██████████| 431/431 [03:29<00:00,  2.06it/s]


TRAIN 
Loss: 0.002675920743459606 
Metrics {'acc': 0.994, 'f1': 0.87, 'precision': 0.877, 'recall': 0.879}



100%|██████████| 431/431 [03:32<00:00,  2.02it/s]


TRAIN 
Loss: 0.0023621580982473582 
Metrics {'acc': 0.994, 'f1': 0.891, 'precision': 0.899, 'recall': 0.895}



100%|██████████| 431/431 [03:34<00:00,  2.01it/s]


TRAIN 
Loss: 0.0018679091966071646 
Metrics {'acc': 0.996, 'f1': 0.9, 'precision': 0.907, 'recall': 0.904}



100%|██████████| 431/431 [03:30<00:00,  2.05it/s]


TRAIN 
Loss: 0.0018243165506927276 
Metrics {'acc': 0.996, 'f1': 0.914, 'precision': 0.918, 'recall': 0.919}



100%|██████████| 431/431 [03:29<00:00,  2.05it/s]


TRAIN 
Loss: 0.0014202293909653582 
Metrics {'acc': 0.997, 'f1': 0.919, 'precision': 0.924, 'recall': 0.923}



100%|██████████| 431/431 [03:31<00:00,  2.04it/s]


TRAIN 
Loss: 0.0012239183862513988 
Metrics {'acc': 0.998, 'f1': 0.935, 'precision': 0.938, 'recall': 0.938}



100%|██████████| 431/431 [03:33<00:00,  2.02it/s]


TRAIN 
Loss: 0.0013099060284211404 
Metrics {'acc': 0.997, 'f1': 0.937, 'precision': 0.941, 'recall': 0.94}



100%|██████████| 431/431 [03:31<00:00,  2.04it/s]


TRAIN 
Loss: 0.001056238156478539 
Metrics {'acc': 0.998, 'f1': 0.952, 'precision': 0.955, 'recall': 0.955}



100%|██████████| 431/431 [03:31<00:00,  2.04it/s]


TRAIN 
Loss: 0.001011479112201186 
Metrics {'acc': 0.998, 'f1': 0.947, 'precision': 0.949, 'recall': 0.95}



In [16]:
sentences, dev_labels, pred_labels = evaluate_test_texts(model, df_test)

Tamanho lista de tokens 39
Tamanho lista de tokens 39
Tamanho lista de tokens 39
Tamanho lista de tokens 39
Tamanho lista de tokens 36
Tamanho lista de tokens 36
Tamanho lista de tokens 36
Tamanho lista de tokens 36
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 54
Tamanho lista de tokens 54
Tamanho lista de tokens 54
Tamanho lista de tokens 54
Tamanho lista de tokens 37
Tamanho lista de tokens 37
Tamanho lista de tokens 37
Tamanho lista de tokens 37
Tamanho lista de tokens 53
Tamanho lista de tokens 53
Tamanho lista de tokens 53
Tamanho lista de tokens 53
Tamanho lista de tokens 56
Tamanho lista de tokens 56
Tamanho lista de tokens 56
Tamanho lista de tokens 56
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 35
Tamanho lista de tokens 38
Tamanho lista de tokens 38
Tamanho lista de tokens 38
Tamanho lista de tokens 38
Tamanho lista de tokens 51
T

In [17]:
result = classification_report(dev_labels, pred_labels, mode="strict", scheme=IOB2, zero_division=False)
print(result)

              precision    recall  f1-score   support

         ACH       0.92      0.96      0.94       101
         ATE       0.70      0.89      0.78        18
         BOR       0.38      0.43      0.40        14
         CAL       0.88      0.96      0.92        82
         LOC       0.79      0.90      0.84        99
         TAM       0.81      0.88      0.84        99

   micro avg       0.82      0.91      0.86       413
   macro avg       0.74      0.84      0.79       413
weighted avg       0.82      0.91      0.86       413



# Save Results

In [18]:
report_index_from_df = df_test['report'].to_list()
dict = {"Report" : report_index_from_df, "Token" : sentences, "IOB_label" : dev_labels, "IOB_label_pred" : pred_labels}

In [19]:
df_with_results = pd.DataFrame(dict)
df_with_results.to_csv('results.csv', encoding='utf-8', index=False)

# Load Models

In [None]:
dir_path = r"path"
model_loaded = BioBertNER(tokens_dim=len(unique_tags))
model_loaded.load_state_dict(torch.load(dir_path))