In [44]:
import pandas as pd

## Loading the multinerd the English documents

In [None]:
from datasets import load_dataset
# loading the dataset
dataset = load_dataset("Babelscape/multinerd")
dataset = dataset.filter(lambda example: example['lang'] == 'en')

- Giving more memory spaces while the training by changing some env variables for Mac M1

In [118]:
import os
# Set the environment variable
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Now, you can access the environment variable if needed
value = os.environ.get("PYTORCH_MPS_HIGH_WATERMARK_RATIO")
print("PYTORCH_MPS_HIGH_WATERMARK_RATIO:", value)


PYTORCH_MPS_HIGH_WATERMARK_RATIO: 0.0


In [75]:
import pandas as pd
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer 

# using the seqeval for our evaluating the models
metric = datasets.load_metric("seqeval")

def tokenize_and_align_labels(examples,tokenizer, label_all_tokens=True): 
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    
    return tokenized_inputs


def compute_metrics(eval_preds, label_list,only_overall):
    """
    Computes evaluation metrics for Named Entity Recognition (NER) predictions.

    Parameters:
    - eval_preds (tuple): Tuple containing predicted logits and true labels.
    - label_list (list): List of NER label names.
    - only_overall (bool): If True, returns only overall evaluation metrics.

    Returns:
    - dict: Dictionary containing evaluation metrics including precision, recall, F1 score, and accuracy.

    Note:
    - The function expects logits and labels to be in the same order.
    - It removes values where the label is -100 (padding tokens).
    - The label_list should correspond to the NER labels used in the model.
    - Evaluation metrics are computed using the `metric` object.
    - If only_overall is True, returns only overall evaluation metrics.

    """
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    # We remove all the values where the label is -100
    predictions = [ 
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    true_labels = [ 
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
    ] 
    results = metric.compute(predictions=predictions, references=true_labels)

    if only_overall == True:
        return { 
                "precision": results["overall_precision"], 
                "recall": results["overall_recall"], 
                "f1": results["overall_f1"], 
                "accuracy": results["overall_accuracy"], 
        }
    else: # returning the metrics include the overall and the individual performance for each NER token
        return results
        
def fine_tuning_model(dataset, tokenizer_type= "distilbert-base-cased",label_list=[], model_type="distilbert-base-cased",system_type='A', evaluate = False):
    """ 
    Fine-tunes a Token Classification model on the provided dataset using the specified tokenizer and model architecture.

    Parameters:
    - dataset (Dataset): A dataset containing train, validation, and test splits.
    - tokenizer_type (str): The type of tokenizer to be used. Defaults to "distilbert-base-cased".
    - label_list (list): List of NER label names.
    - model_type (str): The type of pre-trained model architecture to be used. Defaults to "distilbert-base-cased".
    - system_type (str): The system identifier (A or B) for saving checkpoints and models.
    - evaluate (bool): If True, loads a pre-trained model and evaluates it on the test dataset.

    Returns:
    - If evaluate is False:
        Tuple: (model, tokenizer, trainer, tokenized_datasets)
    - If evaluate is True:
        TrainerOutput: Results from model evaluation on the test dataset.

    Note:
    - The function uses the Hugging Face Transformers library.
    - If evaluate is False, it performs training, saves the trained model and tokenizer, and returns necessary objects.
    - If evaluate is True, it loads a pre-trained model, evaluates it on the test dataset, and returns evaluation results.

    """
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_type) 

    tokenized_datasets = dataset.map(lambda example: tokenize_and_align_labels(example, tokenizer), batched=True)
    
    data_collator = DataCollatorForTokenClassification(tokenizer)

    args = TrainingArguments( 
        f"models/{system_type}/{model_type}_checkpoints",
        evaluation_strategy = "epoch", 
        learning_rate=2e-5, 
        per_device_train_batch_size=10, 
        per_device_eval_batch_size=10, 
        num_train_epochs=2, 
        weight_decay=0.01, 
        )     

    if evaluate == False: 
        model = AutoModelForTokenClassification.from_pretrained(model_type, num_labels=len(label_list))
        trainer = Trainer( 
            model, 
            args, 
            train_dataset=tokenized_datasets['train'], 
            eval_dataset=tokenized_datasets['validation'], 
            data_collator=data_collator, 
            tokenizer=tokenizer, 
            compute_metrics= lambda eval_preds: compute_metrics(eval_preds, label_list, True),
            ) 
        print('Training the model')
        trainer.train()
        print('Saving the model')
        model.save_pretrained('models/' +system_type+'/'+ model_type + "_ner_model")
        print('Saving the tokenizer')
        tokenizer.save_pretrained('models/'+system_type+'/'+ model_type + "_ner_tokenizer")
        return model, tokenizer, trainer, tokenized_datasets
    else: 
        # Load the saved model
        loaded_model_a = AutoModelForTokenClassification.from_pretrained('models/' + system_type + '/' + model_type + "_ner_model")
        trainer_2 = Trainer( 
                loaded_model_a, 
                args, 
                data_collator=data_collator, 
                tokenizer=tokenizer, 
                compute_metrics= lambda eval_preds: compute_metrics(eval_preds, label_list,True),
        )
        print('making prediction on test dataset')
        results = trainer_2.predict(tokenized_datasets['test'])
        return results

Defines label-to-id and id-to-label mappings for Named Entity Recognition (NER) systems A and B.

- system_a_label2id: Mapping of NER labels to unique IDs for System A.
- system_a_id2label: Mapping of unique IDs to NER labels for System A.

- system_b_label2id: Mapping of selected NER labels to unique IDs for System B.
                     System B focuses on specific entity types.
- system_b_id2label: Mapping of unique IDs to NER labels for System B.


In [6]:
system_a_label2id = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-ANIM": 7,
    "I-ANIM": 8,
    "B-BIO": 9,
    "I-BIO": 10,
    "B-CEL": 11,
    "I-CEL": 12,
    "B-DIS": 13,
    "I-DIS": 14,
    "B-EVE": 15,
    "I-EVE": 16,
    "B-FOOD": 17,
    "I-FOOD": 18,
    "B-INST": 19,
    "I-INST": 20,
    "B-MEDIA": 21,
    "I-MEDIA": 22,
    "B-MYTH": 23,
    "I-MYTH": 24,
    "B-PLANT": 25,
    "I-PLANT": 26,
    "B-TIME": 27,
    "I-TIME": 28,
    "B-VEHI": 29,
    "I-VEHI": 30,
  }

system_a_id2label = {id: label for label, id in system_a_label2id.items()}
system_b_label2id = {label: id for label, id in system_a_label2id.items() if id <=8 or id in [13,14]}
system_b_label2id['B-DIS']=9
system_b_label2id['I-DIS']=10
system_b_id2label = {id: label for label, id in system_b_label2id.items()}

In [77]:
def convert_tags(tags):
      """
    Converts Named Entity Recognition (NER) tags in the specified format to a simplified version for System B.

    Parameters:
    - tags (list): List of NER tags represented as integers.

    Returns:
    - list: List of converted NER tags based on System B requirements.
    """ 
    for i in range(len(tags)):
        if tags[i]== 13: # corresponds to B-DIS
            tags[i] = 9
        elif tags[i] == 14: # corresponds to I-DIS
            tags[i] = 10
        elif tags[i]> 8:    # corresponts to all NER tags except (PERSON(PER), ORGANIZATION(ORG), LOCATION(LOC), DISEASES(DIS), ANIMAL(ANIM))
            tags[i] = 0

    return tags

dataset_a = dataset # for the system A we will take the entier dataset 

# Apply the convert_tags function to the 'ner_tags' column in your dataset
dataset_b = dataset_a.map(lambda example: {'ner_tags': convert_tags(example['ner_tags'])})

 19%|█▉        | 623/3291 [32:16<2:18:12,  3.11s/it]65.04 examples/s]
Map: 100%|██████████| 262560/262560 [00:07<00:00, 33955.21 examples/s]
Map: 100%|██████████| 32820/32820 [00:00<00:00, 36550.78 examples/s]
Map: 100%|██████████| 32908/32908 [00:00<00:00, 35031.17 examples/s]


Fine-tunes a Token Classification model on the provided dataset using the specified tokenizer and model architecture for System A.

Parameters:
- dataset_a (Dataset): A dataset containing train, validation, and test splits for System A.
- tokenizer_type (str): The type of tokenizer to be used. Defaults to "distilbert-base-cased".
- label_list (list): List of NER label names for System A.
- model_type (str): The type of pre-trained model architecture to be used. Defaults to "distilbert-base-cased".
- system_type (str): The system identifier (A or B) for saving checkpoints and models.

Returns:
- Tuple: (model_a, tokenizer_a, trainer_a, tokenized_datasets_a)


# TRAINING & EVALUATION

In [11]:
model_a, tokenizer_a, trainer_a, tokenized_datasets_a = fine_tuning_model(dataset_a, tokenizer_type= "distilbert-base-cased",label_list=system_a_id2label, model_type="distilbert-base-cased",system_type= 'A')

Fine-tunes a Token Classification model on the provided dataset using the specified tokenizer and model architecture for System B.

Parameters:
- dataset_b (Dataset): A dataset containing train, validation, and test splits for System B.
- tokenizer_type (str): The type of tokenizer to be used. Defaults to "distilbert-base-cased".
- label_list (list): List of NER label names for System B.
- model_type (str): The type of pre-trained model architecture to be used. Defaults to "distilbert-base-cased".
- system_type (str): The system identifier (A or B) for saving checkpoints and models.

Returns:
- Tuple: (model_b, tokenizer_b, trainer_b, tokenized_datasets_b)


In [None]:
model_b, tokenizer_b, trainer_b, tokenized_datasets_b = fine_tuning_model(dataset_b, tokenizer_type= "distilbert-base-cased",label_list=system_b_id2label, model_type="distilbert-base-cased",system_type= 'B')

## Evaluating System A

Evaluates a pre-trained Token Classification model on the test dataset for System A.

In [113]:
system_a_evaluation_res = fine_tuning_model(dataset_a, tokenizer_type= "distilbert-base-cased",label_list=system_a_id2label, model_type="distilbert-base-cased",system_type= 'A', evaluate = True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


making prediction on test dataset


100%|██████████| 3291/3291 [02:41<00:00, 20.34it/s]


In [114]:
# Assuming system_a_res is the result obtained from the fine-tuning_model function for System A.

# Extract predictions and true labels from the evaluation results
a_eval_preds = system_a_res.predictions, system_a_res.label_ids

# Define the label list for System A
label_list = list(system_a_label2id.keys())

# Compute detailed evaluation metrics for System A
system_a_eval_metrics = compute_metrics(a_eval_preds, system_a_id2label, only_overall=False)


In [115]:
# Precision, Recall, F1 for every ner tags 
pd.DataFrame(system_a_eval_metrics).T.iloc[:-4]

Unnamed: 0,precision,recall,f1,number
ANIM,0.725483,0.740347,0.732839,5076.0
BIO,0.549296,0.795918,0.65,98.0
CEL,0.878049,0.857143,0.86747,252.0
DIS,0.736238,0.768272,0.751914,8182.0
EVE,0.949861,0.947222,0.94854,720.0
FOOD,0.648944,0.624686,0.636584,7972.0
INST,0.706667,0.779412,0.741259,136.0
LOC,0.985702,0.99171,0.988697,24608.0
MEDIA,0.939123,0.941416,0.940268,2458.0
MYTH,0.838095,0.765217,0.8,230.0


In [116]:
# overall precision, recall, f1 for all ner tags
print('Overall_recall for system A:    ', system_a_eval_metrics['overall_recall'])
print('Overall_precision for system A: ', system_a_eval_metrics['overall_precision'])
print('Overall_f1 for system A:        ', system_a_eval_metrics['overall_f1'])


Overall_recall for system A:     0.8966653424625523
Overall_precision for system A:  0.8843078057661727
Overall_f1 for system A:         0.8904437018472106


## Evaluating the system B

Evaluates a pre-trained Token Classification model on the test dataset for System B.

In [103]:
system_b_evaluation_res = fine_tuning_model(dataset_b, tokenizer_type= "distilbert-base-cased",label_list=system_b_id2label, model_type="distilbert-base-cased",system_type= 'B', evaluate = True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.
Map:   0%|          | 0/32820 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 32820/32820 [00:02<00:00, 16134.49 examples/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


making prediction on test dataset


100%|██████████| 3291/3291 [01:45<00:00, 31.30it/s]


In [106]:
# Assuming system_b_evaluation_res is the result obtained from the fine_tuning_model function for System B.

# Extract predictions and true labels from the evaluation results for System B
b_eval_preds = system_b_evaluation_res.predictions, system_b_evaluation_res.label_ids

# Define the label list for System B
label_list = list(system_b_label2id.keys())

""" 
Computes detailed evaluation metrics for Named Entity Recognition (NER) predictions on System B.

Parameters:
- b_eval_preds (tuple): Tuple containing predicted logits and true labels for System B.
- label_list (list): List of NER label names for System B.
- system_b_id2label (dict): Mapping of unique IDs to NER labels for System B.
- only_overall (bool): If False, returns detailed evaluation metrics for each NER label.

Returns:
- dict: Dictionary containing evaluation metrics for System B, including precision, recall, F1 score, and number of occurrences.
"""
# Compute detailed evaluation metrics for System B
system_b_eval_metrics = compute_metrics(b_eval_preds, system_b_id2label, only_overall=False)


In [108]:
pd.DataFrame(system_b_eval_metrics).T.iloc[:-4]

Unnamed: 0,precision,recall,f1,number
ANIM,0.731594,0.753208,0.742244,6702.0
DIS,0.767059,0.809335,0.78763,4028.0
LOC,0.992661,0.992509,0.992585,38978.0
ORG,0.971551,0.97174,0.971646,10262.0
PER,0.982165,0.988442,0.985293,14708.0


In [117]:
# overall precision, recall, f1 for all ner tags
print('Overall_recall for system B:    ', system_b_eval_metrics['overall_recall'])
print('Overall_precision for system B: ', system_b_eval_metrics['overall_precision'])
print('Overall_f1 for system B:        ', system_b_eval_metrics['overall_f1'])


Overall_recall for system B:     0.9574975226974477
Overall_precision for system B:  0.9510028195988721
Overall_f1 for system B:         0.9542391202807842
