### Libraries

In [1]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

Todo: add more loops to the training

In [19]:
class CFG:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'microsoft/deberta-base'
    train_head_only = True

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'deberta-base')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'), 
        evaluation_strategy="epoch",
        per_device_train_batch_size=4,
        )
    model_save_path = os.path.join(target_dir, 'model')

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, add_prefix_space=True, use_fast=True)

### Labels

In [5]:
keys_to_flatten = ['labels', 'input_ids', 'attention_mask', 'org_word_ids','document']
data_path = os.path.join('..','data', 'raw', 'train.json')
data = get_dataset_from_path(data_path)
data = preprocess_data(data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:07<00:00,  1.28s/it]


In [20]:
def get_fbeta_score(precision, recall, beta=5.0):
        b2 = beta ** 2
        return (1 + b2) * ((precision * recall) / (b2 * precision + recall))

def compute_metrics(p, labels_list):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        
        true_predictions = [
            [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = get_fbeta_score(precision, recall)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score
            }
        
        return results

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Freezing the DeBERTa layers
if CFG.train_head_only:
    print('Training head only')
    for param in model.base_model.parameters():
        param.requires_grad = False
else:
    print('Training all layers')

Training head only


In [23]:
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

In [24]:
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/8580 [00:00<?, ?it/s]

{'loss': 0.6205, 'learning_rate': 4.708624708624709e-05, 'epoch': 0.17}
{'loss': 0.0151, 'learning_rate': 4.4172494172494175e-05, 'epoch': 0.35}
{'loss': 0.0092, 'learning_rate': 4.125874125874126e-05, 'epoch': 0.52}
{'loss': 0.0093, 'learning_rate': 3.834498834498835e-05, 'epoch': 0.7}
{'loss': 0.0068, 'learning_rate': 3.5431235431235434e-05, 'epoch': 0.87}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.00410860450938344, 'eval_recall': 0.0189873417721519, 'eval_precision': 0.15, 'eval_fbeta_score': 0.01964735516372796, 'eval_runtime': 156.7492, 'eval_samples_per_second': 8.108, 'eval_steps_per_second': 1.014, 'epoch': 1.0}
{'loss': 0.0058, 'learning_rate': 3.251748251748252e-05, 'epoch': 1.05}
{'loss': 0.0045, 'learning_rate': 2.9603729603729606e-05, 'epoch': 1.22}
{'loss': 0.0061, 'learning_rate': 2.6689976689976692e-05, 'epoch': 1.4}
{'loss': 0.0046, 'learning_rate': 2.377622377622378e-05, 'epoch': 1.57}
{'loss': 0.0052, 'learning_rate': 2.0862470862470865e-05, 'epoch': 1.75}
{'loss': 0.0052, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.92}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.002976579125970602, 'eval_recall': 0.21835443037974683, 'eval_precision': 0.3791208791208791, 'eval_fbeta_score': 0.2219747587230883, 'eval_runtime': 151.8816, 'eval_samples_per_second': 8.368, 'eval_steps_per_second': 1.047, 'epoch': 2.0}
{'loss': 0.0041, 'learning_rate': 1.5034965034965034e-05, 'epoch': 2.1}
{'loss': 0.0053, 'learning_rate': 1.2121212121212122e-05, 'epoch': 2.27}
{'loss': 0.0047, 'learning_rate': 9.207459207459208e-06, 'epoch': 2.45}
{'loss': 0.0033, 'learning_rate': 6.2937062937062944e-06, 'epoch': 2.62}
{'loss': 0.0038, 'learning_rate': 3.3799533799533803e-06, 'epoch': 2.8}
{'loss': 0.0036, 'learning_rate': 4.662004662004662e-07, 'epoch': 2.97}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.002796303015202284, 'eval_recall': 0.26582278481012656, 'eval_precision': 0.3853211009174312, 'eval_fbeta_score': 0.2690317812269032, 'eval_runtime': 151.0298, 'eval_samples_per_second': 8.416, 'eval_steps_per_second': 1.053, 'epoch': 3.0}
{'train_runtime': 5663.547, 'train_samples_per_second': 6.059, 'train_steps_per_second': 1.515, 'train_loss': 0.0418458515878046, 'epoch': 3.0}


TrainOutput(global_step=8580, training_loss=0.0418458515878046, metrics={'train_runtime': 5663.547, 'train_samples_per_second': 6.059, 'train_steps_per_second': 1.515, 'train_loss': 0.0418458515878046, 'epoch': 3.0})

In [25]:
trainer.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/177 [00:00<?, ?it/s]

{'test_loss': 0.0026531913317739964,
 'test_recall': 0.2755102040816326,
 'test_precision': 0.48,
 'test_fbeta_score': 0.2800997506234414,
 'test_runtime': 178.1774,
 'test_samples_per_second': 7.93,
 'test_steps_per_second': 0.993,
 'epoch': 3.0}

In [26]:
trainer.save_model(CFG.model_save_path)

## Using the model

In [4]:
model_from_disk = AutoModelForTokenClassification.from_pretrained(CFG.model_save_path)

In [9]:
trainer_from_huggingface = Trainer(
    model=model_from_disk,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer_from_huggingface.evaluate(data_test)

  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.0008325826493091881,
 'eval_recall': 0.8086734693877551,
 'eval_precision': 0.8386243386243386,
 'eval_fbeta_score': 0.8097858125368441,
 'eval_runtime': 56.4552,
 'eval_samples_per_second': 25.029,
 'eval_steps_per_second': 3.135}