### Libraries

In [36]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

current versions:
- zmilczarek/pii-detection-roberta-v3
- zmilczarek/pii-detection-roberta-v2
- zeinab-sheikhi/Roberta-pii-detection-baseline (not trained on the unified data split)

Roberta
 The inputs of the model take pieces of 512 contiguous tokens that may span over documents

## Baseline roberta model

The training below produced roberta-v3

In [37]:
class CFG:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'roberta-base'
    train_head_only = False

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'roberta-base-experiment')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'), 
        evaluation_strategy="epoch"
        )
    model_save_path = os.path.join(target_dir, 'model')

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, add_prefix_space=True)

### Data

In [9]:
keys_to_flatten = ['labels', 'input_ids', 'attention_mask', 'org_word_ids','document']
data_path = os.path.join('..','data', 'raw', 'train.json')
data = get_dataset_from_path(data_path)
data = preprocess_data(data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:06<00:00,  1.11s/it]


In [22]:
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

### Metrics

In [23]:
def get_fbeta_score(precision, recall, beta=5.0):
        b2 = beta ** 2
        return (1 + b2) * ((precision * recall) / (b2 * precision + recall))

def compute_metrics(p, labels_list):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        preds_set = np.unique(predictions)

        for pred in preds_set:
            assert pred in list(range(len(labels_list))) and pred !=-100, f"Predicted label {pred} is not in the labels list \npreds_set: {preds_set}"
        # Remove ignored index (special tokens)
        
        true_predictions = [
            [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = get_fbeta_score(precision, recall)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score
            }
        
        return results

### Training the model

In [24]:
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [26]:
trainer.train(resume_from_checkpoint=True)

  0%|          | 0/4290 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0008855126798152924, 'eval_recall': 0.9208860759493671, 'eval_precision': 0.7385786802030457, 'eval_fbeta_score': 0.9122257053291535, 'eval_runtime': 51.961, 'eval_samples_per_second': 24.461, 'eval_steps_per_second': 3.06, 'epoch': 3.0}
{'train_runtime': 369.1821, 'train_samples_per_second': 92.954, 'train_steps_per_second': 11.62, 'train_loss': 2.4982217030647474e-05, 'epoch': 3.0}


TrainOutput(global_step=4290, training_loss=2.4982217030647474e-05, metrics={'train_runtime': 369.1821, 'train_samples_per_second': 92.954, 'train_steps_per_second': 11.62, 'train_loss': 2.4982217030647474e-05, 'epoch': 3.0})

In [28]:
trainer.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/177 [00:00<?, ?it/s]

{'test_loss': 0.0009286311105825007,
 'test_recall': 0.9132653061224489,
 'test_precision': 0.7991071428571429,
 'test_fbeta_score': 0.9082747853239658,
 'test_runtime': 55.9538,
 'test_samples_per_second': 25.253,
 'test_steps_per_second': 3.163,
 'epoch': 3.0}

In [27]:
trainer.save_model(CFG.model_save_path)

All the above is saved in the 'experiment' folder

In [42]:
# model_to_push = AutoModelForTokenClassification.from_pretrained(CFG.model_save_path)
# token = 'token_here'
# model_to_push.push_to_hub('zmilczarek/pii-detection-roberta-v3', token=token)

## Using the model

In [41]:
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zmilczarek/pii-detection-roberta-v3')

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [8]:
trainer_from_huggingface = Trainer(
    model=model_from_huggingface,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer_from_huggingface.evaluate(data_test)

  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.0008325826493091881,
 'eval_recall': 0.8086734693877551,
 'eval_precision': 0.8386243386243386,
 'eval_fbeta_score': 0.8097858125368441,
 'eval_runtime': 56.0802,
 'eval_samples_per_second': 25.196,
 'eval_steps_per_second': 3.156}

## Roberta - finetuning just the HEAD

In [29]:
class CFG_just_head:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'roberta-base'
    train_head_only = True

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'roberta-base')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer_just_head'), 
        evaluation_strategy="epoch",
        num_train_epochs=10,
        learning_rate=1e-5)
    model_save_path = os.path.join(target_dir, 'model_just_head')
    

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    CFG_just_head.model_name, num_labels=len(CFG_just_head.id2label), id2label=CFG_just_head.id2label, label2id=CFG_just_head.label2id
)
for param in model.base_model.parameters():
        param.requires_grad = False
print('\nTraining head only')

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training head only


In [32]:
trainer_head_only = Trainer(
    model=model,
    args=CFG_just_head.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    compute_metrics=partial(compute_metrics, labels_list=CFG_just_head.LABELS_LIST),
)

current problem : compute metrics crashes and the model wants to train for 38hrs

In [33]:
trainer_head_only.train()

  0%|          | 0/14300 [00:00<?, ?it/s]

{'loss': 2.1446, 'learning_rate': 9.650349650349651e-06, 'epoch': 0.35}
{'loss': 1.5924, 'learning_rate': 9.300699300699301e-06, 'epoch': 0.7}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 1.0211737155914307, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.0388, 'eval_samples_per_second': 26.458, 'eval_steps_per_second': 3.31, 'epoch': 1.0}
{'loss': 1.1538, 'learning_rate': 8.951048951048951e-06, 'epoch': 1.05}
{'loss': 0.8266, 'learning_rate': 8.601398601398602e-06, 'epoch': 1.4}
{'loss': 0.5889, 'learning_rate': 8.251748251748254e-06, 'epoch': 1.75}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.36827197670936584, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 47.6737, 'eval_samples_per_second': 26.66, 'eval_steps_per_second': 3.335, 'epoch': 2.0}
{'loss': 0.4204, 'learning_rate': 7.902097902097902e-06, 'epoch': 2.1}
{'loss': 0.3039, 'learning_rate': 7.552447552447552e-06, 'epoch': 2.45}
{'loss': 0.2163, 'learning_rate': 7.202797202797203e-06, 'epoch': 2.8}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.13764449954032898, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 47.6876, 'eval_samples_per_second': 26.653, 'eval_steps_per_second': 3.334, 'epoch': 3.0}
{'loss': 0.1617, 'learning_rate': 6.853146853146854e-06, 'epoch': 3.15}
{'loss': 0.1205, 'learning_rate': 6.503496503496504e-06, 'epoch': 3.5}
{'loss': 0.0954, 'learning_rate': 6.153846153846155e-06, 'epoch': 3.85}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.06435351818799973, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 47.714, 'eval_samples_per_second': 26.638, 'eval_steps_per_second': 3.332, 'epoch': 4.0}
{'loss': 0.0778, 'learning_rate': 5.804195804195804e-06, 'epoch': 4.2}
{'loss': 0.0664, 'learning_rate': 5.4545454545454545e-06, 'epoch': 4.55}
{'loss': 0.0584, 'learning_rate': 5.1048951048951055e-06, 'epoch': 4.9}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.04016344994306564, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.3613, 'eval_samples_per_second': 27.415, 'eval_steps_per_second': 3.43, 'epoch': 5.0}
{'loss': 0.0503, 'learning_rate': 4.755244755244756e-06, 'epoch': 5.24}
{'loss': 0.0463, 'learning_rate': 4.405594405594406e-06, 'epoch': 5.59}
{'loss': 0.0403, 'learning_rate': 4.055944055944056e-06, 'epoch': 5.94}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.02843858301639557, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.3029, 'eval_samples_per_second': 27.45, 'eval_steps_per_second': 3.434, 'epoch': 6.0}
{'loss': 0.0371, 'learning_rate': 3.7062937062937064e-06, 'epoch': 6.29}
{'loss': 0.0339, 'learning_rate': 3.356643356643357e-06, 'epoch': 6.64}
{'loss': 0.0324, 'learning_rate': 3.006993006993007e-06, 'epoch': 6.99}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.022022198885679245, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.3236, 'eval_samples_per_second': 27.437, 'eval_steps_per_second': 3.432, 'epoch': 7.0}
{'loss': 0.0306, 'learning_rate': 2.6573426573426574e-06, 'epoch': 7.34}
{'loss': 0.029, 'learning_rate': 2.307692307692308e-06, 'epoch': 7.69}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.018407242372632027, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.298, 'eval_samples_per_second': 27.453, 'eval_steps_per_second': 3.434, 'epoch': 8.0}
{'loss': 0.0258, 'learning_rate': 1.9580419580419583e-06, 'epoch': 8.04}
{'loss': 0.0264, 'learning_rate': 1.6083916083916085e-06, 'epoch': 8.39}
{'loss': 0.0246, 'learning_rate': 1.258741258741259e-06, 'epoch': 8.74}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.016538305208086967, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.2815, 'eval_samples_per_second': 27.462, 'eval_steps_per_second': 3.435, 'epoch': 9.0}
{'loss': 0.0242, 'learning_rate': 9.090909090909091e-07, 'epoch': 9.09}
{'loss': 0.0241, 'learning_rate': 5.594405594405595e-07, 'epoch': 9.44}
{'loss': 0.0231, 'learning_rate': 2.097902097902098e-07, 'epoch': 9.79}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.01594206690788269, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.2887, 'eval_samples_per_second': 27.458, 'eval_steps_per_second': 3.435, 'epoch': 10.0}
{'train_runtime': 5808.5798, 'train_samples_per_second': 19.693, 'train_steps_per_second': 2.462, 'train_loss': 0.28984168146040057, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


TrainOutput(global_step=14300, training_loss=0.28984168146040057, metrics={'train_runtime': 5808.5798, 'train_samples_per_second': 19.693, 'train_steps_per_second': 2.462, 'train_loss': 0.28984168146040057, 'epoch': 10.0})

In [34]:
trainer_head_only.save_model(CFG_just_head.model_save_path)

In [35]:
trainer_head_only.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/177 [00:00<?, ?it/s]

  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'test_loss': 0.0167564507573843,
 'test_recall': 0.0,
 'test_precision': 0.0,
 'test_fbeta_score': nan,
 'test_runtime': 51.5318,
 'test_samples_per_second': 27.42,
 'test_steps_per_second': 3.435,
 'epoch': 10.0}

Roberta : The model doesn't learn with these hyperparameters and on head only