### Libraries

In [1]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


**TODO:** add more loops to the training.

In [2]:
class CFG:
    """Central configuration for fine-tuning the token-classification model.

    Every value is a class attribute and is evaluated once, when the class
    body runs. That includes ``training_args``, so a ``TrainingArguments``
    object is constructed as soon as this cell executes.
    """

    # Tag set; a tag's position in this list is its integer class id.
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    # tag -> class id
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    # Sentinel entry: -100 is the label value that compute_metrics skips.
    label2id['[PAD]'] = -100
    # class id -> tag. NOTE: built from label2id, so it also contains the
    # -100 -> '[PAD]' entry and has len(LABELS_LIST) + 1 items.
    id2label = {i: label for label, i in label2id.items()}
    # Single seed shared by the data split and the training run.
    seed = 42

    # model checkpoint
    model_name = 'microsoft/deberta-base'
    # When True, only the classification head is meant to be trained.
    train_head_only = True

    # path to the directory where the model will be saved
    # abspath('') resolves to the current working directory.
    local_path = os.path.abspath('')
    target_dir = os.path.join(local_path, '..', 'models', 'deberta-base')

    # training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'),
        evaluation_strategy="epoch",
        per_device_train_batch_size=4,
        num_train_epochs=10,
        # Keep the trainer's seed tied to the one used for the data split.
        seed=seed,
    )
    model_save_path = os.path.join(target_dir, 'model')

# Tokenizer matching the configured checkpoint.
# NOTE(review): add_prefix_space=True is presumably required because the inputs
# are already split into words — confirm against preprocess_data.
tokenizer = AutoTokenizer.from_pretrained(
    CFG.model_name,
    use_fast=True,
    add_prefix_space=True,
)

### Load data and encode labels

In [3]:
# Raw training file, addressed relative to the notebook's working directory.
data_path = os.path.join('..', 'data', 'raw', 'train.json')
# Columns passed to preprocess_data as `keys_to_flatten`.
keys_to_flatten = ['labels', 'input_ids', 'attention_mask', 'org_word_ids', 'document']

# Load the raw dataset and run it straight through preprocessing.
data = preprocess_data(
    get_dataset_from_path(data_path),
    tokenizer,
    label2id=CFG.label2id,
    keys_to_flatten=keys_to_flatten,
)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:06<00:00,  1.10s/it]


In [7]:
def get_fbeta_score(precision, recall, beta=5.0):
    """Combine precision and recall into an F-beta score.

    Computes ``(1 + beta^2) * P * R / (beta^2 * P + R)``. With ``beta > 1``
    recall is weighted more heavily than precision.

    Args:
        precision: precision value.
        recall: recall value.
        beta: weight of recall relative to precision (default 5.0).

    Returns:
        The F-beta score, or 0.0 when the denominator is zero (for example
        when precision and recall are both 0).
    """
    b2 = beta ** 2
    denominator = b2 * precision + recall
    # Without this guard, P == R == 0 raises ZeroDivisionError (or yields NaN
    # for numpy floats). That happens when the model predicts no correct
    # entity at all.
    if denominator == 0:
        return 0.0
    return (1 + b2) * ((precision * recall) / denominator)

def compute_metrics(p, labels_list):
    """Compute recall, precision and F-beta for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores; the predicted class id is the argmax over axis 2.
            ``labels`` holds the reference ids, with -100 marking positions
            that must be ignored.
        labels_list: tag names indexed by class id.

    Returns:
        dict with the keys ``'recall'``, ``'precision'`` and ``'fbeta_score'``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label == -100) and map ids to tag names,
    # building the prediction and reference sequences in one pass.
    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            if reference_id == -100:
                continue
            kept_predictions.append(labels_list[predicted_id])
            kept_labels.append(labels_list[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': get_fbeta_score(precision, recall),
    }

In [32]:
# Build the head's label maps from real class ids only. CFG.label2id also
# carries the '[PAD]' -> -100 sentinel; passing it through would size the
# classifier at len(LABELS_LIST) + 1 outputs and store a negative id in the
# model config.
model_label2id = {label: idx for label, idx in CFG.label2id.items() if idx >= 0}
model_id2label = {idx: label for label, idx in model_label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(model_id2label),
    id2label=model_id2label,
    label2id=model_label2id,
)

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Optionally freeze the backbone so that only the classification head is updated.
if not CFG.train_head_only:
    print('Training all layers')
else:
    print('Training head only')
    # Turn off gradients for every parameter under model.base_model.
    for backbone_param in model.base_model.parameters():
        backbone_param.requires_grad = False

Training head only


In [4]:
# Split the preprocessed data into train / validation / test using the shared seed.
data_train, data_eval, data_test = get_train_val_test_split(
    data,
    seed=CFG.seed,
)

In [35]:
# Assemble the Trainer. compute_metrics is pre-bound to the tag list with
# functools.partial so it can be called with the prediction pair alone.
trainer = Trainer(
    args=CFG.training_args,
    model=model,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
    eval_dataset=data_eval,
    train_dataset=data_train,
)

# Run training; evaluation on data_eval follows the configured strategy.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/28600 [00:00<?, ?it/s]

Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.6611, 'learning_rate': 4.912587412587413e-05, 'epoch': 0.17}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0153, 'learning_rate': 4.825174825174825e-05, 'epoch': 0.35}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0093, 'learning_rate': 4.7377622377622384e-05, 'epoch': 0.52}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0096, 'learning_rate': 4.6503496503496505e-05, 'epoch': 0.7}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0068, 'learning_rate': 4.562937062937063e-05, 'epoch': 0.87}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.00403813598677516, 'eval_recall': 0.015822784810126583, 'eval_precision': 0.11627906976744186, 'eval_fbeta_score': 0.016366612111292967, 'eval_runtime': 140.0694, 'eval_samples_per_second': 9.074, 'eval_steps_per_second': 1.135, 'epoch': 1.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0057, 'learning_rate': 4.475524475524476e-05, 'epoch': 1.05}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-3500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0042, 'learning_rate': 4.388111888111888e-05, 'epoch': 1.22}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0058, 'learning_rate': 4.300699300699301e-05, 'epoch': 1.4}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-4500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0043, 'learning_rate': 4.213286713286714e-05, 'epoch': 1.57}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-5000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0048, 'learning_rate': 4.125874125874126e-05, 'epoch': 1.75}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-5500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0045, 'learning_rate': 4.038461538461539e-05, 'epoch': 1.92}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.002648464171215892, 'eval_recall': 0.3037974683544304, 'eval_precision': 0.4266666666666667, 'eval_fbeta_score': 0.3072, 'eval_runtime': 139.776, 'eval_samples_per_second': 9.093, 'eval_steps_per_second': 1.138, 'epoch': 2.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-6000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0036, 'learning_rate': 3.9510489510489516e-05, 'epoch': 2.1}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-6500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0046, 'learning_rate': 3.8636363636363636e-05, 'epoch': 2.27}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-7000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0041, 'learning_rate': 3.776223776223776e-05, 'epoch': 2.45}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-7500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0028, 'learning_rate': 3.688811188811189e-05, 'epoch': 2.62}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-8000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0032, 'learning_rate': 3.601398601398602e-05, 'epoch': 2.8}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/deberta-base/trainer/checkpoint-8500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0028, 'learning_rate': 3.5139860139860145e-05, 'epoch': 2.97}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0021899312268942595, 'eval_recall': 0.3639240506329114, 'eval_precision': 0.4872881355932203, 'eval_fbeta_score': 0.36750245821042277, 'eval_runtime': 139.6315, 'eval_samples_per_second': 9.103, 'eval_steps_per_second': 1.139, 'epoch': 3.0}
{'loss': 0.0039, 'learning_rate': 3.4265734265734265e-05, 'epoch': 3.15}
{'loss': 0.0025, 'learning_rate': 3.339160839160839e-05, 'epoch': 3.32}
{'loss': 0.0031, 'learning_rate': 3.251748251748252e-05, 'epoch': 3.5}
{'loss': 0.0024, 'learning_rate': 3.164335664335665e-05, 'epoch': 3.67}
{'loss': 0.0035, 'learning_rate': 3.0769230769230774e-05, 'epoch': 3.85}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0020445690024644136, 'eval_recall': 0.5, 'eval_precision': 0.5047923322683706, 'eval_fbeta_score': 0.5001826372823572, 'eval_runtime': 139.5856, 'eval_samples_per_second': 9.106, 'eval_steps_per_second': 1.139, 'epoch': 4.0}
{'loss': 0.0024, 'learning_rate': 2.9895104895104898e-05, 'epoch': 4.02}
{'loss': 0.0027, 'learning_rate': 2.9020979020979022e-05, 'epoch': 4.2}
{'loss': 0.0035, 'learning_rate': 2.8146853146853146e-05, 'epoch': 4.37}
{'loss': 0.0022, 'learning_rate': 2.7272727272727273e-05, 'epoch': 4.55}
{'loss': 0.0029, 'learning_rate': 2.6398601398601404e-05, 'epoch': 4.72}
{'loss': 0.0028, 'learning_rate': 2.5524475524475528e-05, 'epoch': 4.9}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0019473452121019363, 'eval_recall': 0.5411392405063291, 'eval_precision': 0.5294117647058824, 'eval_fbeta_score': 0.5406785844582269, 'eval_runtime': 139.2079, 'eval_samples_per_second': 9.13, 'eval_steps_per_second': 1.142, 'epoch': 5.0}
{'loss': 0.0021, 'learning_rate': 2.465034965034965e-05, 'epoch': 5.07}
{'loss': 0.0024, 'learning_rate': 2.377622377622378e-05, 'epoch': 5.24}
{'loss': 0.0027, 'learning_rate': 2.2902097902097902e-05, 'epoch': 5.42}
{'loss': 0.0031, 'learning_rate': 2.202797202797203e-05, 'epoch': 5.59}
{'loss': 0.0019, 'learning_rate': 2.1153846153846154e-05, 'epoch': 5.77}
{'loss': 0.0028, 'learning_rate': 2.027972027972028e-05, 'epoch': 5.94}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0018824865110218525, 'eval_recall': 0.5822784810126582, 'eval_precision': 0.5443786982248521, 'eval_fbeta_score': 0.5807234765719835, 'eval_runtime': 139.6128, 'eval_samples_per_second': 9.104, 'eval_steps_per_second': 1.139, 'epoch': 6.0}
{'loss': 0.0033, 'learning_rate': 1.9405594405594408e-05, 'epoch': 6.12}
{'loss': 0.0024, 'learning_rate': 1.8531468531468532e-05, 'epoch': 6.29}
{'loss': 0.0025, 'learning_rate': 1.7657342657342656e-05, 'epoch': 6.47}
{'loss': 0.0021, 'learning_rate': 1.6783216783216786e-05, 'epoch': 6.64}
{'loss': 0.0026, 'learning_rate': 1.590909090909091e-05, 'epoch': 6.82}
{'loss': 0.0021, 'learning_rate': 1.5034965034965034e-05, 'epoch': 6.99}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0018466755282133818, 'eval_recall': 0.620253164556962, 'eval_precision': 0.5632183908045977, 'eval_fbeta_score': 0.6178467507274491, 'eval_runtime': 139.7428, 'eval_samples_per_second': 9.095, 'eval_steps_per_second': 1.138, 'epoch': 7.0}
{'loss': 0.0028, 'learning_rate': 1.4160839160839163e-05, 'epoch': 7.17}
{'loss': 0.0021, 'learning_rate': 1.3286713286713287e-05, 'epoch': 7.34}
{'loss': 0.0024, 'learning_rate': 1.2412587412587414e-05, 'epoch': 7.52}
{'loss': 0.0028, 'learning_rate': 1.153846153846154e-05, 'epoch': 7.69}
{'loss': 0.0023, 'learning_rate': 1.0664335664335665e-05, 'epoch': 7.87}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0017761330818757415, 'eval_recall': 0.6170886075949367, 'eval_precision': 0.5769230769230769, 'eval_fbeta_score': 0.6154406409322651, 'eval_runtime': 527.5534, 'eval_samples_per_second': 2.409, 'eval_steps_per_second': 0.301, 'epoch': 8.0}
{'loss': 0.0018, 'learning_rate': 9.79020979020979e-06, 'epoch': 8.04}
{'loss': 0.0021, 'learning_rate': 8.916083916083918e-06, 'epoch': 8.22}
{'loss': 0.0027, 'learning_rate': 8.041958041958042e-06, 'epoch': 8.39}
{'loss': 0.0021, 'learning_rate': 7.167832167832168e-06, 'epoch': 8.57}
{'loss': 0.0024, 'learning_rate': 6.2937062937062944e-06, 'epoch': 8.74}
{'loss': 0.0023, 'learning_rate': 5.419580419580419e-06, 'epoch': 8.92}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0017441479722037911, 'eval_recall': 0.6170886075949367, 'eval_precision': 0.5855855855855856, 'eval_fbeta_score': 0.6158144054415158, 'eval_runtime': 139.8976, 'eval_samples_per_second': 9.085, 'eval_steps_per_second': 1.137, 'epoch': 9.0}
{'loss': 0.0025, 'learning_rate': 4.5454545454545455e-06, 'epoch': 9.09}
{'loss': 0.0017, 'learning_rate': 3.6713286713286715e-06, 'epoch': 9.27}
{'loss': 0.0026, 'learning_rate': 2.7972027972027974e-06, 'epoch': 9.44}


KeyboardInterrupt: 

I had to force-quit the training loop (see the `KeyboardInterrupt` above), so below I resume training from the latest checkpoint.

In [38]:
# Continue the interrupted run from a saved checkpoint instead of starting over.
trainer.train(
    resume_from_checkpoint=True,
)

  0%|          | 0/28600 [00:00<?, ?it/s]

{'loss': 0.0022, 'learning_rate': 1.9230769230769234e-06, 'epoch': 9.62}
{'loss': 0.0019, 'learning_rate': 1.0489510489510491e-06, 'epoch': 9.79}
{'loss': 0.0022, 'learning_rate': 1.7482517482517484e-07, 'epoch': 9.97}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0017485427670180798, 'eval_recall': 0.6170886075949367, 'eval_precision': 0.5701754385964912, 'eval_fbeta_score': 0.6151419558359621, 'eval_runtime': 139.7541, 'eval_samples_per_second': 9.095, 'eval_steps_per_second': 1.138, 'epoch': 10.0}
{'train_runtime': 1013.1027, 'train_samples_per_second': 112.911, 'train_steps_per_second': 28.23, 'train_loss': 0.0001205749507550593, 'epoch': 10.0}


TrainOutput(global_step=28600, training_loss=0.0001205749507550593, metrics={'train_runtime': 1013.1027, 'train_samples_per_second': 112.911, 'train_steps_per_second': 28.23, 'train_loss': 0.0001205749507550593, 'epoch': 10.0})

In [39]:
# Score the trained model on the test split; metric keys get the 'test' prefix.
trainer.evaluate(
    eval_dataset=data_test,
    metric_key_prefix='test',
)

  0%|          | 0/177 [00:00<?, ?it/s]

{'test_loss': 0.001627293648198247,
 'test_recall': 0.6020408163265306,
 'test_precision': 0.6210526315789474,
 'test_fbeta_score': 0.6027504911591355,
 'test_runtime': 157.2376,
 'test_samples_per_second': 8.986,
 'test_steps_per_second': 1.126,
 'epoch': 10.0}

In [44]:
# Seeded shuffle, then keep the first 1000 rows as a small sample of the training split.
data_train_small = (
    data_train
    .shuffle(seed=CFG.seed)
    .select(range(1000))
)

In [46]:
# Score the model on the training sample; metric keys get the 'train' prefix.
trainer.evaluate(
    eval_dataset=data_train_small,
    metric_key_prefix='train',
)

{'train_loss': 0.0022829105146229267,
 'train_recall': 0.5703703703703704,
 'train_precision': 0.6875,
 'train_fbeta_score': 0.5741324921135647,
 'train_runtime': 109.9517,
 'train_samples_per_second': 9.095,
 'train_steps_per_second': 1.137,
 'epoch': 10.0}

In [40]:
# Write the trained model to the configured save directory.
trainer.save_model(output_dir=CFG.model_save_path)

## Using the model

In [5]:
# Load a token-classification model from the 'zmilczarek/pii-detection-deberta-v1' identifier.
model_from_huggingafce = AutoModelForTokenClassification.from_pretrained(
    'zmilczarek/pii-detection-deberta-v1'
)

In [8]:
# Evaluation-only Trainer around the loaded model. No training arguments are
# passed, so the Trainer falls back to its defaults.
trainer_from_huggingafce = Trainer(
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
    model=model_from_huggingafce,
)

# Score the loaded model on the test split.
trainer_from_huggingafce.evaluate(eval_dataset=data_test)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.001627293648198247,
 'eval_recall': 0.6020408163265306,
 'eval_precision': 0.6210526315789474,
 'eval_fbeta_score': 0.6027504911591355,
 'eval_runtime': 159.7179,
 'eval_samples_per_second': 8.847,
 'eval_steps_per_second': 1.108}

In [9]:
# Score the loaded model on the validation split.
trainer_from_huggingafce.evaluate(eval_dataset=data_eval)

  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.0017485427670180798,
 'eval_recall': 0.6170886075949367,
 'eval_precision': 0.5701754385964912,
 'eval_fbeta_score': 0.6151419558359621,
 'eval_runtime': 141.0742,
 'eval_samples_per_second': 9.009,
 'eval_steps_per_second': 1.127}