# This notebook contains the currently working code for the baseline model. The final and ready-to-use version will be in src/baseline.py (at some point)

### Libraries

In [24]:
import torch 
from transformers import AutoTokenizer, AutoModelForTokenClassification, EarlyStoppingCallback, DataCollatorForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from datasets import Dataset
import numpy as np
from seqeval.metrics import recall_score, precision_score
from functools import partial


### Loading the model and data

In [3]:
MODEL_NAME = 'bert-base-uncased'
TRAIN_HEAD_ONLY = True

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [6]:
data_path = os.path.join('..','data', 'raw', 'train.json')

data_pd = pd.read_json(data_path)

data = Dataset.from_pandas(data_pd)

### Labels

In [16]:
LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']

label2id = {label: i for i, label in enumerate(LABELS_LIST)}
label2id['[PAD]'] = -100
id2label = {i: label for label, i in label2id.items()}
id2label

{0: 'B-NAME_STUDENT',
 1: 'B-EMAIL',
 2: 'B-USERNAME',
 3: 'B-ID_NUM',
 4: 'B-PHONE_NUM',
 5: 'B-URL_PERSONAL',
 6: 'B-STREET_ADDRESS',
 7: 'I-NAME_STUDENT',
 8: 'I-EMAIL',
 9: 'I-USERNAME',
 10: 'I-ID_NUM',
 11: 'I-PHONE_NUM',
 12: 'I-URL_PERSONAL',
 13: 'I-STREET_ADDRESS',
 14: 'O',
 -100: '[PAD]'}

### Preprocessing

In [7]:
def tokenize_and_preserve_labels(examples):
    """
    To be used with batched = False
    Tokenizes the words using BERT and aligns the labels.
    """
    labels = []
    tokenized_sentence = []
    for word, label in zip(examples['tokens'], examples['labels']):
        #tokenizes the word using BERT's subword tokenizer
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        #adds the same label to all the subwords of the word
        labels.extend([label] * n_subwords)
    examples['tokens'] = tokenized_sentence
    examples['labels'] = labels
    return examples

In [8]:
aligned_data = data.map(tokenize_and_preserve_labels,batched=False)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [9]:
def chunk(examples, block_size=510, sliding_window=0):
    """
    To be used with batched = False
    Chunks the examples into blocks of size block_size, with overlap window of size sliding_window. 
    """
    tokenized_sentences = []
    labels = []
    for idx in range(0, len(examples['tokens']), block_size):
        #idx = max(0, i - sliding_window)
        chunk_token = examples['tokens'][idx:idx+block_size]
        chunk_label = examples['labels'][idx:idx+block_size]
        if len(chunk_token) < block_size:
            chunk_token += ['[PAD]'] * (block_size - len(chunk_token))
            chunk_label += ['[PAD]'] * (block_size - len(chunk_label))
        tokenized_sentences.append(chunk_token)
        labels.append(chunk_label)
    return {'tokens': tokenized_sentences, 'labels': labels}

In [10]:
chunked_data = aligned_data.map(chunk, batched=False)
chunked_data = chunked_data.remove_columns(['document', 'full_text','trailing_whitespace'])

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [11]:
from functools import reduce

flat_chunks_tokens = list(reduce(lambda x, y: x + y, chunked_data['tokens'], []))
flat_chunks_labels = list(reduce(lambda x, y: x + y, chunked_data['labels'], []))
chunked_flattened_data = Dataset.from_dict({'tokens': flat_chunks_tokens, 'labels': flat_chunks_labels})

In [12]:
def encode_tokens(example):
    detokenized = list(map(lambda x: ' '.join(x), example['tokens']))
    detokenized = list(map(lambda x: x.replace(' ##', ''), detokenized))
    encoded = tokenizer(detokenized, truncation=True, is_split_into_words = False, return_tensors='pt')
    return encoded

encoded_data = chunked_flattened_data.map(encode_tokens, batched=True)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

In [20]:
def encode_labels(example):
    labels = example['labels']
    #adding -100 for the [CLS] token and [SEP] token
    encoded = [-100] + [label2id[label] for label in labels] + [-100]
    return {'labels': encoded}

encoded_labels = encoded_data.map(encode_labels, batched=False)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

## Training the model

In [14]:
import evaluate

seqeval = evaluate.load("seqeval")

In [48]:


def compute_metrics(p):
    
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [LABELS_LIST[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [LABELS_LIST[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [32]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
#freezing the BERT layers
if TRAIN_HEAD_ONLY:
    for param in model.base_model.parameters():
        param.requires_grad = False

In [21]:
encoded_data_split = encoded_labels.train_test_split(test_size=0.2)

In [None]:
target_dir = "model/trainer_model_initial_preprocessing"

In [None]:
training_args = TrainingArguments(output_dir=target_dir, evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    #tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

##  Trying sth new

based on : https://www.kaggle.com/code/gdataranger/pii-data-detection-deberta-2048/notebook

In [38]:
class ModelTrainer:
    def __init__(self, model, tokenizer, device = 'mps'):
        self.device = device
        self.model = (model).to(self.device)
        self.tokenizer = tokenizer
        self.data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

    # Compute the model performance metrics
    def get_fbeta_score(self, precision, recall, beta=5.0):
        b2 = beta ** 2
        return (1 + b2) * ((precision * recall) / (b2 * precision + recall))

    def compute_metrics(self, p, all_labels):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        print(true_predictions)
        print(true_labels)

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = self.get_fbeta_score(precision, recall)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score
        }
        return results

    def train(self, train_ds, all_labels, checkpoint_name='model_checkpoint'):
        """
        :param train_ds: tokenized_ds w/ ['train', 'test'] splits.
        """
        training_args = TrainingArguments(
            output_dir=checkpoint_name,
            fp16=True if self.device == 'cuda' else False,
            learning_rate=2e-5,
            #gradient_accumulation_steps=2,
            per_device_train_batch_size=8,  # OutOfMemoryError: CUDA out of memory.
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            report_to="none",  # disable "wandb"
            # https://github.com/huggingface/transformers/issues/17971#issuecomment-1171579884
            use_mps_device=True if self.device == 'mps' else False, # MacOS M3 - 'mps'
            evaluation_strategy='epoch',
            eval_accumulation_steps=1, # equal to the number of GPU devices
            save_strategy='epoch',
            load_best_model_at_end=True,
            do_eval=True,
            save_total_limit=1,
            logging_steps=100,
            #lr_scheduler_type='cosine',
            metric_for_best_model='fbeta_score',
            #greater_is_better=True,
            warmup_ratio=0.1,
            weight_decay=0.01,
            push_to_hub=False, # TODO: Implement once working.
        )
        model_trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_ds['train'],
            eval_dataset=train_ds['test'],
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=partial(self.compute_metrics, all_labels=all_labels),
            callbacks=[
                EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.001)
            ],
            #device=self.device,
        )
        model_trainer.train()
        model_trainer.save_model(checkpoint_name)
        self.tokenizer.save_pretrained(checkpoint_name)

#model_trainer = ModelTrainer(model, tokenizer)

#model_trainer.train(encoded_data_split, id2label, checkpoint_name='model_checkpoint')

In [60]:
# load model

model = AutoModelForTokenClassification.from_pretrained('model/model_initial_preprocessing')

In [61]:
model_trainer = ModelTrainer(model, tokenizer)
c_m = model_trainer.compute_metrics

#test compute_metrics for examples with labels 0-14
#p contains 2 examples

model.to('cpu')
with torch.no_grad():
    p = model(torch.tensor(encoded_data['input_ids'][:2]), return_dict=True)
    p = (p.logits, torch.tensor(encoded_labels['labels'][:2]))
    r = c_m(p, LABELS_LIST)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [65]:
trainer = Trainer(
    model=model,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    #tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=partial(c_m, all_labels=LABELS_LIST),
)

#eval
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/321 [00:00<?, ?it/s]

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

{'eval_loss': 0.00046115790610201657,
 'eval_recall': 0.9525222551928784,
 'eval_precision': 0.8991596638655462,
 'eval_fbeta_score': 0.9503529947620131,
 'eval_runtime': 104.0852,
 'eval_samples_per_second': 24.624,
 'eval_steps_per_second': 3.084}

## Using the model

### Loading the model

In [9]:
model_dir = 'model/model_initial_preprocessing'
model_loaded = AutoModelForTokenClassification.from_pretrained(model_dir)
model_loaded = model_loaded.to('cpu')

In [8]:
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zmilczarek/pii-detection-baseline-v0.1')

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

### Inference

In [None]:
"""
The plan to make the model label test.csv correctly

1. Load the model
2. Prepare the dataset  (prepare input ids/ att mask in chunks)
3. Get the labels

"""