In [None]:
# Install the third-party packages this notebook adds on top of the runtime.
# NOTE(review): transformers, torch, pandas and tqdm are imported below but are not
# installed here — presumably they are preinstalled in the target runtime; confirm.
!pip install datasets
!pip install spacy
# Download the small Portuguese spaCy pipeline that sliding_window() loads by name.
!python -m spacy download pt_core_news_sm

### 1) Read and tokenize data

In [8]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict, load_dataset
import re
import spacy
import torch
import pandas as pd
from tqdm.notebook import tqdm

# Load the model and the tokenizer
# "<model_name>" is a placeholder: replace it with the identifier or local path of the
# seq2seq checkpoint to fine-tune before running this cell.
model_name = "<model_name>"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def fix_tags_with_replace(text):
    """
    Normalize the spacing and the closing syntax of entity tags in a text.

    For every known entity name, stray spaces inside the angle brackets are
    removed, a single space right after an opening tag is dropped, and every
    closing-tag variant is rewritten to the ``</TAG/>`` form used by the rest
    of this notebook.

    Args:
        text (str): Text whose tags may be malformed (e.g. model output or
            space-joined tokens).

    Returns:
        str: The same text with normalized tags.
    """
    entity_names = (
        "AGE", "PHONE", "FAX", "EMAIL", "URL", "IP_ADDRESS", "DATE", "IDNUM",
        "MEDICAL_RECORD", "DEVICE", "HEALTH_PLAN", "BIOID", "STREET", "CITY",
        "ZIP", "STATE", "COUNTRY", "LOCATION_OTHER", "ORGANIZATION", "HOSPITAL",
        "PATIENT", "DOCTOR", "USERNAME", "PROFESSION", "OTHER", "LOCATION",
    )

    # (malformed, normalized) templates; they are applied in exactly this order
    # for each entity name because later rules rely on the earlier clean-up.
    rewrite_rules = (
        # Spaces around the opening tag
        ("< {t} >", "<{t}>"),
        ("< {t}>", "<{t}>"),
        ("<{t} >", "<{t}>"),
        # Spaces around the closing tag
        ("</ {t} >", "</{t}/>"),
        ("</ {t}>", "</{t}/>"),
        ("</{t} >", "</{t}/>"),
        # Malformed closings with extra slashes
        ("<{t}/> ", "</{t}/>"),
        ("<{t}/ >", "</{t}/>"),
        ("</{t}/ >", "</{t}/>"),
        # Spaces between the tags and the inner content
        ("<{t}> ", "<{t}>"),
        (" </{t}>", "</{t}/>"),
        ("</{t}>", "</{t}/>"),
    )

    for name in entity_names:
        for malformed, normalized in rewrite_rules:
            text = text.replace(malformed.format(t=name), normalized.format(t=name))

    return text

# Loaded spaCy pipelines keyed by model name, so the model is read from disk once
# instead of on every sliding_window() call (it is called once per document).
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='pt_core_news_sm'):
    """Return the spaCy pipeline for `model_name`, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def sliding_window(text, window_size=200, overlap=50):
    """
    Function that splits a text into sliding windows of size `window_size` with an `overlap`.

    The text is tokenized with the Portuguese spaCy pipeline, each window is
    rebuilt by joining its tokens with single spaces, and the result is passed
    through `fix_tags_with_replace` so that tags split by the tokenizer are
    stitched back together.

    Args:
        text (str): The text to be processed.
        window_size (int): The maximum number of tokens per window. Must be positive.
        overlap (int): The number of overlapping tokens between consecutive
            windows. Must satisfy ``0 <= overlap < window_size``.

    Returns:
        List of str: List of texts divided into sliding windows. Empty when the
        text produces no tokens.

    Raises:
        ValueError: If `window_size` is not positive or `overlap` is outside
            ``[0, window_size)``. Such values previously either crashed inside
            ``range`` or silently dropped tokens / returned no windows.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if not 0 <= overlap < window_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < window_size, got overlap={overlap} "
            f"with window_size={window_size}"
        )

    # Tokenize the full text with the cached SpaCy pipeline
    nlp = _get_spacy_pipeline()
    tokens = [token.text for token in nlp(text)]

    # Consecutive windows start `step` tokens apart
    step = window_size - overlap

    windows = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + window_size]
        windows.append(fix_tags_with_replace(" ".join(window)))

        # Stop once this window reaches the end of the text, so no trailing
        # window made only of already-covered overlap tokens is emitted
        if start + window_size >= len(tokens):
            break

    return windows


def preprocess_function(examples):
    """
    Tokenizes the input and target texts to prepare them for model training.

    This function processes a batch of examples containing input texts and their
    corresponding target texts. Both sides are truncated and padded to 512
    tokens. Padding positions in the labels are replaced by -100 so that the
    loss ignores them instead of training the model to predict padding.

    Args:
        examples (dict): A dictionary containing two keys:
            - "input_text": str or List[str], the source texts.
            - "target_text": str or List[str], the target/label texts.

    Returns:
        dict: A dictionary with tokenized inputs and labels in the format expected
              by Hugging Face's Trainer API:
              - "input_ids": Token IDs for the input texts.
              - "attention_mask": Attention masks for the inputs.
              - "labels": Token IDs for the target texts, with padding set to -100.
    """
    inputs = examples["input_text"]  # Input text
    targets = examples["target_text"]  # Target text

    # Tokenize inputs (padding="max_length" ensures uniform length)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize targets (labels). `text_target` is the supported replacement for the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets, max_length=512, truncation=True, padding="max_length"
    )["input_ids"]

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips, so padded positions do not contribute
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    if labels and isinstance(labels[0], (list, tuple)):
        # Batched call: one list of ids per example
        labels = [_mask_padding(token_ids) for token_ids in labels]
    else:
        # Single example: a flat list of ids
        labels = _mask_padding(labels)

    model_inputs["labels"] = labels
    return model_inputs

In [None]:
# Set to False to reuse the chunk files written by a previous run instead of
# re-tokenizing the whole corpus with spaCy.
preprocess_dataset = True

# Files that cache the chunked train / validation examples
TRAIN_CHUNKS_PATH = 't5_train.json'
EVAL_CHUNKS_PATH = 't5_eval.json'

# Number of validation chunks kept for evaluation during training
MAX_EVAL_CHUNKS = 500


def _build_chunks(split):
    """Split every sample of `split` into windows and pair each tagged window
    (target) with its tag-stripped version (input)."""
    tagged_chunks = [chunk for sample in tqdm(split) for chunk in sliding_window(sample["text"])]
    return [{'input_text': re.sub(r"<[^>]+>", "", chunk), 'target_text': chunk} for chunk in tagged_chunks]


### READ DATA
# Loaded in both modes: the inference cells read dataset_['test'] even when the
# cached chunk files are reused, so it must not be defined only in one branch.
dataset_ = load_dataset("Venturus/AnonyMED-BR")

if preprocess_dataset:

    ### PRE-PROCESS DATA
    # Train
    train_chunks = _build_chunks(dataset_['train'])

    with open(TRAIN_CHUNKS_PATH, 'w', encoding='utf-8') as f:
        json.dump(train_chunks, f, ensure_ascii=False, indent=4)

    # Eval
    eval_chunks = _build_chunks(dataset_['validation'])

    with open(EVAL_CHUNKS_PATH, 'w', encoding='utf-8') as f:
        json.dump(eval_chunks, f, ensure_ascii=False, indent=4)

else:

    # The files are written as UTF-8 with ensure_ascii=False, so they must be read
    # back as UTF-8 too rather than with the platform default encoding.
    with open(TRAIN_CHUNKS_PATH, 'r', encoding='utf-8') as file:
        train_chunks = json.load(file)

    with open(EVAL_CHUNKS_PATH, 'r', encoding='utf-8') as file:
        eval_chunks = json.load(file)

# Convert the data into Hugging Face Dataset objects
train_dataset = Dataset.from_list(train_chunks)
eval_dataset = Dataset.from_list(eval_chunks[:MAX_EVAL_CHUNKS])

# Combine into a DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset
})

# Apply tokenization
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Preview the data
print(dataset)

tokenized_datasets

### 2) Fine Tuning

In [None]:
# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="<path_to_folder>",  # Placeholder: replace with the directory to save results
    report_to="none",               # NOTE(review): presumably disables external experiment trackers — confirm
  #  evaluation_strategy="steps",   # Disabled; would evaluate on a step schedule (see eval_steps), not per epoch
    learning_rate=5e-5,             # Learning rate
    per_device_train_batch_size=4,  # Training batch size
    per_device_eval_batch_size=8,   # Evaluation batch size
   # weight_decay=0.01,             # Weight decay (disabled)
    save_total_limit=3,             # Save only the last 3 checkpoints
    num_train_epochs=3,             # Number of training epochs
    predict_with_generate=True,     # Generate text during evaluation
    logging_dir='logs',             # Logs directory
    logging_steps=500,              # Log information every 500 steps
    fp16 = False,                   # fp16 mixed precision is turned off
    save_strategy="epoch",          # Save model at each epoch
    eval_steps=500,                 # NOTE(review): the evaluation strategy line above is commented out, so
                                    # this interval is presumably unused during training — confirm
    gradient_accumulation_steps=1   # Gradient accumulation steps (1 = no accumulation)
)

# Build the trainer from the model/tokenizer loaded earlier and the tokenized splits
trainer = Seq2SeqTrainer(
    model=model,                                    # Loaded model
    args=training_args,                             # Training configurations
    train_dataset=tokenized_datasets["train"],      # Training dataset
    eval_dataset=tokenized_datasets["validation"],  # Validation dataset
    tokenizer=tokenizer                             # Corresponding tokenizer
)

# Start training
trainer.train()

# Save fine-tuned model and tokenizer to the "T5_model" folder, which the
# inference section loads by the same name
model.save_pretrained("T5_model")
tokenizer.save_pretrained("T5_model")
print("Model trained and saved successfully!")

### 3) Inference

In [None]:
def extract_tagged_words(text):
    """
    Collect every tagged entity found in a text.

    An entity is a span of the form ``<TAG>content</TAG/>`` on a single line.
    The tag name is reported as the subcategory and mapped to a coarser
    category; tag names outside the known set get the category "UNKNOWN".

    Args:
        text (str): The input text containing tags.

    Returns:
        list: One dictionary per entity, in order of appearance, with the keys
              "word", "category", "subcategory", "first_position" and
              "last_position" (character offsets of the content in `text`,
              end exclusive).
    """
    # Coarse category -> the tag names (subcategories) that belong to it
    subcategories_by_category = {
        "AGE": ("AGE",),
        "CONTACT": ("PHONE", "FAX", "EMAIL", "URL", "IP_ADDRESS"),
        "DATE": ("DATE",),
        "ID": ("IDNUM", "MEDICAL_RECORD", "DEVICE", "HEALTH_PLAN", "BIOID"),
        "LOCATION": ("STREET", "CITY", "ZIP", "STATE", "COUNTRY", "LOCATION_OTHER",
                     "ORGANIZATION", "HOSPITAL", "LOCATION"),
        "NAME": ("PATIENT", "DOCTOR", "USERNAME"),
        "PROFESSION": ("PROFESSION",),
        "OTHER": ("OTHER",),
    }
    category_of = {
        subcategory: category
        for category, subcategories in subcategories_by_category.items()
        for subcategory in subcategories
    }

    # Group 1 is the tag name, group 2 the content; the closing tag must repeat the name
    tag_regex = r"<(.*?)>(.*?)</\1/>"

    return [
        {
            "word": found.group(2),
            "category": category_of.get(found.group(1), "UNKNOWN"),
            "subcategory": found.group(1),
            "first_position": found.start(2),
            "last_position": found.end(2),
        }
        for found in re.finditer(tag_regex, text)
    ]

def find_missing_words(predicted_words, labels):
    """
    List the ground-truth words that were not predicted.

    Args:
        predicted_words (list): List of predicted words.
        labels (list): List of ground-truth words.

    Returns:
        tuple: ``(count, words)`` where `words` holds every entry of `labels`
               absent from `predicted_words`, in their original order, and
               `count` is its length.
    """
    missing = []
    for label in labels:
        if label in predicted_words:
            continue
        missing.append(label)
    return len(missing), missing

def calculate_f1_score(tp, fp, fn, verbose=False):
    """
    Derive precision, recall and F1 from raw prediction counts.

    Any metric whose denominator would be zero is reported as 0.

    Args:
        tp (int): True Positives.
        fp (int): False Positives.
        fn (int): False Negatives.
        verbose (bool): Whether to print the three metrics.

    Returns:
        tuple: (f1_score, recall, precision)
    """
    predicted_positives = tp + fp
    actual_positives = tp + fn

    # Share of the predictions that were correct
    precision = 0
    if predicted_positives > 0:
        precision = tp / predicted_positives

    # Share of the ground-truth items that were found
    recall = 0
    if actual_positives > 0:
        recall = tp / actual_positives

    # Harmonic mean of the two
    f1_score = 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)

    if verbose:
        print('Recall:', recall)
        print('Precision:', precision)
        print('F1 Score:', f1_score, '\n')

    return f1_score, recall, precision

def eval(extractive_pred, dict_labels, verbose=False):
    """
    Evaluate extractive predictions against labeled data.

    A prediction is a true positive when its word is a labeled word and its
    subcategory equals the labeled one; every other prediction (unknown word,
    or known word with the wrong subcategory) is a false positive. Labeled
    words with no true-positive prediction are false negatives.

    Note: this function intentionally keeps its original name, which shadows
    Python's built-in `eval` inside the notebook.

    Args:
        extractive_pred (dict): Dictionary whose 'preds' entry is a list of
            predictions, each a dictionary with 'word' and 'subcategory'.
        dict_labels (dict): Ground-truth mapping from word to subcategory.
        verbose (bool): Whether to print detailed evaluation results.

    Returns:
        tuple: (f1, recall, precision)
    """
    TP, FP = 0, 0
    correct_predicted_words = []
    wrong_predicted_words = []
    predicted_words = []
    wrong_predicted_category = []
    for pred in extractive_pred['preds']:
        word, subcategory = pred['word'], pred['subcategory']

        # Direct dict membership; no need to materialize the key list per prediction
        if word not in dict_labels:
            FP += 1
            wrong_predicted_words.append(word)
            continue

        if subcategory == dict_labels[word]:
            TP += 1
            correct_predicted_words.append((word, subcategory))
            # Only correctly classified words count as found for the FN computation
            predicted_words.append(word)
        else:
            FP += 1
            wrong_predicted_category.append((word, subcategory))

    # Calculate False Negatives
    FN, missing_words = find_missing_words(predicted_words, list(dict_labels))
    if verbose:
        print('Missing words:', missing_words)
        print('Correct Predicted words:', correct_predicted_words)
        print('Correct word but wrong category:', wrong_predicted_category)
        print('Wrong Predicted words:', wrong_predicted_words)
        print('Labels:', dict_labels)

    # Calculate F1 Score
    f1, recall, precision = calculate_f1_score(TP, FP, FN, verbose=verbose)

    return f1, recall, precision

def create_generative_format(text):
    """
    Collapse every tagged entity of a text into its opening tag.

    Each ``<TAG>content</TAG/>`` span is replaced by ``<TAG>``, so the output
    says which entity type occupied a position without keeping the original
    content. Text outside such spans is returned untouched.

    Args:
        text (str): Input text containing XML-like tags with content.

    Returns:
        str: Text transformed into generative format with only opening tags.
    """
    # Group 1 is the tag name; the closing tag must repeat it
    tagged_span = r"<(.*?)>(.*?)</\1/>"
    return re.sub(tagged_span, lambda found: "<" + found.group(1) + ">", text)


def run_prediction(input_text, max_length=450):
    """
    Run batched model inference on one or more text windows.

    Args:
        input_text (str or list of str): A single text, or the list of windows
            produced by `sliding_window`. All windows are processed as one batch.
        max_length (int): Maximum output length passed to `model.generate`.

    Returns:
        list: List of decoded model predictions, one per input window (empty
              when there is nothing to process).
    """
    # Use the argument itself: the previous version ignored it and read a global
    # named `windows`, so it only worked when the caller happened to define one.
    texts = [input_text] if isinstance(input_text, str) else list(input_text)
    if not texts:
        return []

    # Tokenization
    inputs = tokenizer(texts, return_tensors="pt", padding=True)

    # Move tensors to wherever the model lives (a no-op when both are on CPU)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Batch Inference (greedy decoding)
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,
        max_length=max_length
    )
    return tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

### Load fine-tuned model
# "T5_model" is the folder written at the end of the fine-tuning section
model_name = "T5_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to the device.
# `device` is a torch.device object, so its `.type` is compared: comparing the
# object itself with the string 'cpu' does not test the device type.
if device.type != 'cpu':
    model = model.to(device)


In [None]:
# Get test data
test_set = dataset_['test']

# Per-document accumulators, all appended to in the same order as test_set
list_f1 = []
list_recall = []
list_precision = []
list_pred_gen = []
list_syn = []
list_id = []
list_preds = []
for test_sample in tqdm(test_set):

    # Create the sliding window to input on the NLP model:
    # strip the gold tags first, then cut into 150-token windows without overlap
    clean_text = re.sub(r"<[^>]+>", "", test_sample['text'])
    windows = sliding_window((clean_text), window_size=150, overlap=0)

    # Run batch inference on all windows and post-process data
    preds = run_prediction(windows, 512)

    # Rebuild one document-level prediction, normalize its tags, then undo a few
    # tokenization artifacts: the space before commas, phone numbers split into two
    # PHONE entities around a hyphen, and stray spaces before a PHONE closing tag
    final_pred = ' '.join(preds)
    fixed_final_pred = fix_tags_with_replace(final_pred).replace(' , ',', ').replace('</PHONE/>-<PHONE>','-').replace(' </PHONE/>','</PHONE/>').replace(' )</PHONE/>',')</PHONE/>')#.replace(' )',')').replace('( ','(')

    ### Save prediction in a list
    list_preds.append(fixed_final_pred)

    ### Extractive Format Evaluation ###
    tags = extract_tagged_words(fixed_final_pred)
    extractive_pred = {'id': test_sample['id'], 'preds': tags}

    # Get labels in the evaluation format (word -> subcategory); a word listed
    # more than once keeps the subcategory of its last occurrence
    dict_labels = {item['word']: item['subcategory'] for item in test_sample['labels']}

    # Run evaluation
    f1, recall, precision = eval(extractive_pred, dict_labels, verbose=False)

    # Save results for correct words and classes
    list_f1.append(f1)
    list_recall.append(recall)
    list_precision.append(precision)

    # Save if it is synthetic or not
    list_syn.append(test_sample['synthetic'])

    # Save example id
    list_id.append(test_sample['id'])

    ### Generative Format Evaluation ###
    # Keep the untagged text, the gold text with entities collapsed to their opening
    # tag, and the prediction collapsed the same way
    list_pred_gen.append({'text': re.sub(r"<[^>]+>", "", test_sample['text']), 'masked_text':create_generative_format(test_sample['text']), 'prediction':create_generative_format(fixed_final_pred)})


### Create a dataframe with the evaluations
# One row per test document; this file is read back by the per-entity evaluation
save_df = pd.DataFrame()
save_df['id'] = list_id
save_df['Recall'] = list_recall
save_df['Precision'] = list_precision
save_df['F1'] = list_f1
save_df['synthetic'] = list_syn
save_df['Prediction'] = list_preds
save_df.to_csv('T5_results.csv', index = False)

# Averages of the per-document scores (each document weighs the same); 0 when no
# document was evaluated
avg_f1 = sum(list_f1) / len(list_f1) if list_f1 else 0
avg_recall = sum(list_recall) / len(list_recall) if list_recall else 0
avg_precision = sum(list_precision) / len(list_precision) if list_precision else 0

print('Recall:', avg_recall)
print('Precision:', avg_precision)
print('F1:', avg_f1)

## Save predictions on the generative format
with open('t5_generative_predictions.json', 'w', encoding='utf-8') as f:
      json.dump(list_pred_gen, f, ensure_ascii=False, indent=4)

### Evaluation per entity

In [None]:
# Entity subcategories that are scored one by one in the per-entity evaluation below.
list_entities = ["PHONE", "AGE", "FAX", "EMAIL", "URL", "IP_ADDRESS", "DATE", "IDNUM",
        "MEDICAL_RECORD", "DEVICE", "HEALTH_PLAN", "BIOID", "STREET", "CITY",
        "ZIP", "STATE", "COUNTRY", "LOCATION_OTHER", "ORGANIZATION", "HOSPITAL",
        "PATIENT", "DOCTOR", "USERNAME", "PROFESSION", "OTHER", "LOCATION"]

def eval_entity(extractive_pred, dict_labels, verbose=False):
    """
    Evaluate the performance of extractive predictions against reference labels for entities.

    This function compares predicted words and their subcategories to a reference dictionary of labels.
    It calculates True Positives (TP), False Positives (FP), and False Negatives (FN), and returns
    the corresponding F1 score, recall, and precision. Optionally, it can print detailed information
    about correct and incorrect predictions.

    The scoring rules are exactly those of `eval`; callers are expected to pass predictions and
    labels already filtered down to the entity of interest.

    Args:
        extractive_pred (dict): Dictionary containing predicted entities under the key 'preds',
                                where each prediction is a dictionary with 'word' and 'subcategory'.
        dict_labels (dict): Dictionary of reference labels with words as keys and subcategories as values.
        verbose (bool, optional): If True, prints detailed information about missing words,
                                  correct predictions, and incorrect predictions. Defaults to False.

    Returns:
        tuple: A tuple containing:
            - f1 (float): F1 score for entity predictions.
            - recall (float): Recall score for entity predictions.
            - precision (float): Precision score for entity predictions.
    """
    # Delegate to the notebook's `eval` (defined in the inference section, where it
    # shadows the built-in) instead of keeping a second copy of the same logic, so
    # the document-level and entity-level scores cannot drift apart.
    return eval(extractive_pred, dict_labels, verbose=verbose)

In [None]:
## Read predictions
# A document whose prediction is an empty string is stored as an empty CSV field,
# which pandas reads back as NaN; restore it to '' so the tag regex gets a string.
dict_preds = pd.read_csv('T5_results.csv').fillna({'Prediction': ''}).to_dict('records')

test_set = dataset_['test']

# zip() would silently stop at the shorter sequence, hiding a stale results file
assert len(dict_preds) == len(test_set), "T5_results.csv does not match the test set size"

list_f1_entity = []
list_recall_entity = []
list_precision_entity = []
list_entity = []
list_id = []
list_syn_entity = []
for test_sample, dict_pred in zip(test_set, dict_preds):

    # Rows must be in the same order as the test set
    assert test_sample['id'] == dict_pred['id']

    ### Extractive Format Evaluation ###
    tags = extract_tagged_words(dict_pred['Prediction'])
    extractive_pred = {'id': test_sample['id'], 'preds': tags}

    # Get labels in the evaluation format
    dict_labels = {item['word']: item['subcategory'] for item in test_sample['labels']}

    ## Evaluate performance per entity
    for entity in list_entities:
        ## Filter the entity to be evaluated inside the label
        filtered_dict_labels = {key: value for key, value in dict_labels.items() if value == entity}

        ## Check if the entity exists inside the label to run evaluation;
        ## documents without this entity contribute no row for it
        if len(filtered_dict_labels) == 0:
            continue

        ## Filter the entity to be evaluated that were predicted by the model
        filtered_tags = [sample for sample in extractive_pred['preds'] if sample['subcategory'] == entity]
        filtered_extractive_pred = {'id': test_sample['id'], 'preds': filtered_tags}

        f1, recall, precision = eval_entity(filtered_extractive_pred, filtered_dict_labels, verbose=False)

        list_f1_entity.append(f1)
        list_recall_entity.append(recall)
        list_precision_entity.append(precision)
        list_entity.append(entity)
        list_id.append(test_sample['id'])

        # Save if it is synthetic or not
        list_syn_entity.append(test_sample['synthetic'])

### Create a dataframe with the evaluations (one row per document and entity)
entity_save_df = pd.DataFrame({
    'id': list_id,
    'Entity': list_entity,
    'Recall': list_recall_entity,
    'Precision': list_precision_entity,
    'F1': list_f1_entity,
    'synthetic': list_syn_entity,
})

entity_save_df.to_csv('T5_entity_results.csv')

In [None]:
# Generate results grouped per Entity:
# mean Precision / Recall / F1 over all (document, entity) rows of each entity
df_grouped_entity = entity_save_df.groupby('Entity')[['Precision', 'Recall', 'F1']].mean().reset_index()
df_grouped_entity.to_csv('T5_entity_final_results.csv', index=False)
print(df_grouped_entity)

In [None]:
# Generate results grouped per Entity but separated between real and synthetic samples:
# same per-entity means as above, computed separately for each value of `synthetic`
df_grouped_syn_entity = entity_save_df.groupby(['synthetic', 'Entity'])[['Precision', 'Recall', 'F1']].mean().reset_index()
df_grouped_syn_entity.to_csv('T5_entity_final_grouped_results.csv', index=False)
print(df_grouped_syn_entity)

In [None]:
# Generate F1 scores grouped by real and synthetic samples:
# re-read the per-document results and average Precision / Recall / F1 for each
# value of `synthetic`
df_results_ = pd.read_csv('T5_results.csv')
df_results_grouped = df_results_.groupby(['synthetic'])[['Precision', 'Recall', 'F1']].mean().reset_index()

# The grouped table is only written to disk; it is not displayed by this cell
df_results_grouped.to_csv('T5_grouped_results.csv', index= False)