### Libraries

In [1]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

Roberta
 The inputs of the model take pieces of 512 contiguous tokens that may span over documents

In [2]:
class CFG:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'roberta-base'

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'roberta-base')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'), 
        evaluation_strategy="epoch"
        )
    model_save_path = os.path.join(target_dir, 'model')

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, add_prefix_space=True)


### Labels

In [4]:
keys_to_flatten = ['labels', 'input_ids', 'attention_mask', 'org_word_ids','document']
data_path = os.path.join('..','data', 'raw', 'train.json')
data = get_dataset_from_path(data_path)
data = preprocess_data(data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:07<00:00,  1.30s/it]


In [10]:
def get_fbeta_score(precision, recall, beta=5.0):
        b2 = beta ** 2
        return (1 + b2) * ((precision * recall) / (b2 * precision + recall))

def compute_metrics(p, labels_list):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        
        true_predictions = [
            [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = get_fbeta_score(precision, recall)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score
            }
        
        return results

In [5]:
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

In [8]:
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/4290 [00:00<?, ?it/s]

Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0195, 'learning_rate': 4.4172494172494175e-05, 'epoch': 0.35}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0036, 'learning_rate': 3.834498834498835e-05, 'epoch': 0.7}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.002211528830230236, 'eval_recall': 0.6772151898734177, 'eval_precision': 0.4543524416135881, 'eval_fbeta_score': 0.6646756659897264, 'eval_runtime': 52.2848, 'eval_samples_per_second': 24.309, 'eval_steps_per_second': 3.041, 'epoch': 1.0}
{'loss': 0.0022, 'learning_rate': 3.251748251748252e-05, 'epoch': 1.05}
{'loss': 0.0016, 'learning_rate': 2.6689976689976692e-05, 'epoch': 1.4}
{'loss': 0.0013, 'learning_rate': 2.0862470862470865e-05, 'epoch': 1.75}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0008185721817426383, 'eval_recall': 0.9113924050632911, 'eval_precision': 0.7146401985111662, 'eval_fbeta_score': 0.9018427074551367, 'eval_runtime': 51.2759, 'eval_samples_per_second': 24.787, 'eval_steps_per_second': 3.101, 'epoch': 2.0}
{'loss': 0.0009, 'learning_rate': 1.5034965034965034e-05, 'epoch': 2.1}
{'loss': 0.0007, 'learning_rate': 9.207459207459208e-06, 'epoch': 2.45}
{'loss': 0.0005, 'learning_rate': 3.3799533799533803e-06, 'epoch': 2.8}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0006997723248787224, 'eval_recall': 0.9208860759493671, 'eval_precision': 0.7718832891246684, 'eval_fbeta_score': 0.9140993113446902, 'eval_runtime': 49.5996, 'eval_samples_per_second': 25.625, 'eval_steps_per_second': 3.206, 'epoch': 3.0}
{'train_runtime': 4926.483, 'train_samples_per_second': 6.966, 'train_steps_per_second': 0.871, 'train_loss': 0.0035678880979547966, 'epoch': 3.0}


TrainOutput(global_step=4290, training_loss=0.0035678880979547966, metrics={'train_runtime': 4926.483, 'train_samples_per_second': 6.966, 'train_steps_per_second': 0.871, 'train_loss': 0.0035678880979547966, 'epoch': 3.0})

In [15]:
trainer.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/177 [00:00<?, ?it/s]

{'test_loss': 0.0008325826493091881,
 'test_recall': 0.8086734693877551,
 'test_precision': 0.8386243386243386,
 'test_fbeta_score': 0.8097858125368441,
 'test_runtime': 55.2545,
 'test_samples_per_second': 25.573,
 'test_steps_per_second': 3.203,
 'epoch': 3.0}

In [10]:
trainer.save_model(CFG.model_save_path)

## Using the model

In [8]:
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zmilczarek/pii-detection-roberta-v2')

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [11]:
trainer_from_huggingface = Trainer(
    model=model_from_huggingface,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer_from_huggingface.evaluate(data_test)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.0008325826493091881,
 'eval_recall': 0.8086734693877551,
 'eval_precision': 0.8386243386243386,
 'eval_fbeta_score': 0.8097858125368441,
 'eval_runtime': 56.0622,
 'eval_samples_per_second': 25.204,
 'eval_steps_per_second': 3.157}

## Roberta - finetuning just the HEAD

In [38]:
CFG_just_head = CFG
CFG_just_head.model_save_path = os.path.join(CFG_just_head.target_dir, 'model_just_head')
CFG_just_head.training_args = TrainingArguments(
    output_dir=os.path.join(CFG_just_head.target_dir, 'trainer_just_head'), 
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    learning_rate=1e-4)

In [39]:
model = AutoModelForTokenClassification.from_pretrained(
    CFG_just_head.model_name, num_labels=len(CFG_just_head.id2label), id2label=CFG_just_head.id2label, label2id=CFG_just_head.label2id
)
for param in model.base_model.parameters():
        param.requires_grad = False
print('\nTraining head only')

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training head only


In [40]:
model.classifier.requires_grad = True

In [41]:
trainer_head_only = Trainer(
    model=model,
    args=CFG_just_head.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    compute_metrics=partial(compute_metrics, labels_list=CFG_just_head.LABELS_LIST),
)

In [42]:
trainer_head_only.train()

  0%|          | 0/7150 [00:00<?, ?it/s]

Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.9902, 'learning_rate': 9.300699300699301e-05, 'epoch': 0.7}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.034129269421100616, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.2907, 'eval_samples_per_second': 26.32, 'eval_steps_per_second': 3.293, 'epoch': 1.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0441, 'learning_rate': 8.601398601398601e-05, 'epoch': 1.4}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.012403772212564945, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.2478, 'eval_samples_per_second': 26.343, 'eval_steps_per_second': 3.295, 'epoch': 2.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0208, 'learning_rate': 7.902097902097903e-05, 'epoch': 2.1}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0153, 'learning_rate': 7.202797202797204e-05, 'epoch': 2.8}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.008569514378905296, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.2279, 'eval_samples_per_second': 26.354, 'eval_steps_per_second': 3.297, 'epoch': 3.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0128, 'learning_rate': 6.503496503496504e-05, 'epoch': 3.5}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.006973118055611849, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.1394, 'eval_samples_per_second': 26.402, 'eval_steps_per_second': 3.303, 'epoch': 4.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/roberta-base/trainer_just_head/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0109, 'learning_rate': 5.8041958041958044e-05, 'epoch': 4.2}


KeyboardInterrupt: 