In [3]:
import os
import numpy as np
import torch 

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import recall_score, precision_score, accuracy_score
from functools import partial

2024-05-03 22:48:25.769668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 22:48:25.769765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 22:48:25.928215: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Dataloader

In [4]:
from datasets import Dataset
from functools import partial, reduce
from transformers import AutoTokenizer
from pandas import read_json, read_csv
import os
from tqdm import tqdm


def encode_labels(example, label2id):
    """
    Map every string label of a single example to its integer id.

    Intended for datasets.map() with batched=False.

    Takes in
        - example: a row exposing a 'labels' sequence of label names
        - label2id: a mapping from label name to integer id

    outputs:
        - a dict with a single 'labels' key holding the list of ids
          (a label missing from label2id raises KeyError)
    """
    label_ids = []
    for label_name in example['labels']:
        label_ids.append(label2id[label_name])
    return {'labels': label_ids}


def tokenize_and_align(example, tokenizer, with_labels=True, overlap_size=0, max_length=512):
    """
    Tokenizes the input and aligns the labels with the tokens
    To be used with datasets.map() with batched=False

    Takes in
        - example: an example from the datasets class; must expose 'tokens'
          and 'document' (and 'labels' when with_labels=True)
        - tokenizer: a callable tokenizer whose output supports .pop(),
          item access and .word_ids(chunk_index)
        - with_labels: whether to build an aligned 'labels' column
        - overlap_size: the number of tokens that overlap between two consecutive chunks
        - max_length: length (in tokens) of every chunk; longer inputs overflow
          into additional chunks and shorter ones are padded up to it.
          Defaults to 512, the value that used to be hard-coded.

    outputs:
        - the tokenizer output, one entry per chunk, extended with:
            - 'labels' (only if with_labels): the word-level label repeated for
              each of its sub-tokens, -100 where a token maps to no word
            - 'org_word_ids': the word index of every token in the chunk
            - 'document': the example's document id, repeated once per chunk
    """

    # Only read the labels column when it is actually required.
    org_labels = example['labels'] if with_labels else None

    tokenized_inputs = tokenizer(
        example['tokens'],
        is_split_into_words=True,
        return_offsets_mapping=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=overlap_size,
        return_tensors='pt',
    )
    # These two bookkeeping columns are not part of the returned features.
    tokenized_inputs.pop('overflow_to_sample_mapping')
    tokenized_inputs.pop('offset_mapping')

    new_labels = []
    org_word_ids_list = []
    document_id = []

    # Iterating over chunks
    for chunk_index, _ in enumerate(tokenized_inputs['input_ids']):
        word_ids = tokenized_inputs.word_ids(chunk_index)

        org_word_ids_list.append(word_ids)
        document_id.append(example['document'])

        if with_labels:
            # A word id of None marks a token that belongs to no input word
            # (e.g. CLS, SEP or PAD); -100 is used as its label.
            new_labels.append([
                -100 if word_id is None else org_labels[word_id]
                for word_id in word_ids
            ])

    if with_labels:
        tokenized_inputs['labels'] = new_labels

    tokenized_inputs['org_word_ids'] = org_word_ids_list
    tokenized_inputs['document'] = document_id

    return tokenized_inputs


def flatten_data(data, keys_to_flatten):
    """
    Flattens the rows of the datasets object for the keys_to_flatten columns

    Every selected column is expected to hold one list per row (one entry per
    chunk); the per-row lists are concatenated so that each chunk becomes a
    row of its own in the result.

    Takes in:
        - data: a dataset object (indexable by column name)
        - keys_to_flatten: a list with the keys to flatten
    Outputs:
        - a dataset object with the keys_to_flatten columns
    """
    from itertools import chain

    data_flat = {}

    for key in tqdm(keys_to_flatten):
        # chain.from_iterable concatenates in linear time; repeated `x + y`
        # copies the growing list on every step and fails on an empty column.
        data_flat[key] = list(chain.from_iterable(data[key]))

    return Dataset.from_dict(data_flat)


def preprocess_data(data, tokenizer, label2id={}, with_labels=True, overlap_size=0, keys_to_flatten=['input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document']):
    """
    Preprocesses the data

    Takes in
        - data: a dataset object with columns 'document', 'tokens' (if with_labels=True, also has to have 'labels')
        - tokenizer: a tokenizer object
        - label2id: a dictionary with the labels and their corresponding ids. If with_labels=True, this has to be provided. By default, it's an empty dictionary.
        - with_labels: a boolean indicating if the data has labels. By default, it's True.
        - overlap_size: the number of tokens that overlap between two consecutive chunks. By default, it's 0.
        - keys_to_flatten : a list of columns to keep in the output dataset. By default, it's ['input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document']. The list is never modified; 'labels' is added to a private copy when with_labels=True.

    outputs:
        - a dataset object with keys_to_flatten columns (plus 'labels' if with_labels=True)

    Raises:
        - AssertionError if a required column is missing, or if with_labels=True and label2id is empty
    """

    assert 'document' in data.column_names, "data has to have a 'document' column"
    assert 'tokens' in data.column_names, "data has to have a 'tokens' column"
    if with_labels:
        assert 'labels' in data.column_names, "data has to have a 'labels' column"
        assert label2id, "label2id has to be provided if with_labels=True"

    # Work on a copy: appending to the argument would mutate the shared default
    # list (and the caller's list), so 'labels' would pile up across calls and
    # leak into later with_labels=False calls.
    columns = list(keys_to_flatten)

    if with_labels:
        if 'labels' not in columns:
            columns.append('labels')

        print("encoding the labels...")
        data = data.map(partial(encode_labels, label2id=label2id), batched=False)

    print("tokenizing and aligning...")
    data = data.map(partial(tokenize_and_align, tokenizer=tokenizer, overlap_size=overlap_size, with_labels=with_labels), batched=False)

    print("flattening the data...")
    data = flatten_data(data, columns)

    return data


def get_dataset_from_path(data_path):
    """
    Loads a dataset from a path and returns it as a datasets object

    Takes in
        - data_path: the path to the data (has to be a json or csv file).
          The file type is taken from the text after the last '.', compared
          case-insensitively.

    outputs:
        - a datasets object built from the loaded pandas DataFrame

    Raises:
        - ValueError if the extension is neither json nor csv
    """

    # str() lets path-like objects through; lower() accepts e.g. 'train.JSON'.
    filetype = str(data_path).split('.')[-1].lower()

    if filetype == 'json':
        frame = read_json(data_path)
    elif filetype == 'csv':
        frame = read_csv(data_path)
    else:
        raise ValueError('Filetype not supported. Supported filetypes are: json, csv')

    return Dataset.from_pandas(frame)


def get_train_val_test_split(data, seed, val_size=0.1, test_size=0.1):
    """
    Split a dataset into train / validation / test parts in two passes.

    Takes in:
        - data: a dataset object exposing train_test_split(test_size=..., seed=...)
        - seed: the seed used for both random splits
        - val_size: the validation share, applied to what is left after the
          test part has been removed (not to the full dataset)
        - test_size: the test share, applied to the full dataset
    Outputs:
        - a tuple with data_train, data_val, data_test
    """

    # First pass: carve the test part off the full dataset.
    outer_split = data.train_test_split(test_size=test_size, seed=seed)
    remainder, held_out_test = outer_split['train'], outer_split['test']

    # Second pass: divide the remainder into train and validation.
    inner_split = remainder.train_test_split(test_size=val_size, seed=seed)

    return inner_split['train'], inner_split['test'], held_out_test


In [5]:
def get_fbeta_score(precision, recall, beta=5.0):
    """
    Compute the F-beta score from a precision and a recall value.

    Takes in:
        - precision: the precision value
        - recall: the recall value
        - beta: weight of recall relative to precision (beta > 1 favours
          recall). By default, it's 5.0.
    Outputs:
        - (1 + beta^2) * precision * recall / (beta^2 * precision + recall),
          or 0.0 when that denominator is zero (e.g. precision and recall
          are both 0), instead of dividing by zero.
    """
    b2 = beta ** 2
    denominator = b2 * precision + recall
    if denominator == 0:
        # Nothing was predicted correctly; the score is defined as 0.
        return 0.0
    return (1 + b2) * ((precision * recall) / denominator)


def compute_metrics(p, labels_list):
    """
    Compute recall, precision and F-beta for one evaluation pass.

    Takes in:
        - p: a pair (predictions, labels); predictions are reduced with an
          argmax over axis 2 (presumably (batch, sequence, num_labels)
          scores - confirm against the caller)
        - labels_list: label names, indexed by class id
    Outputs:
        - a dict with 'recall', 'precision' and 'fbeta_score'
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 (ignored index, special tokens) are dropped.
            if gold_id == -100:
                continue
            kept_predictions.append(labels_list[predicted_id])
            kept_labels.append(labels_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': get_fbeta_score(precision, recall),
    }


In [6]:
# Model configuration

class CFG:
    """Static configuration for the BERT PII token-classification run.

    All values are class attributes evaluated once, when the class body runs.
    """

    # The 15 BIO tags used by the task.
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    # NOTE(review): this adds a 16th entry keyed by the ignore index -100, so
    # len(id2label) is 16 rather than 15 - confirm that is intended wherever
    # the number of labels is derived from these mappings.
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Side effect at class-definition time: sets a global cuDNN flag.
    torch.backends.cudnn.benchmark = True

    # model checkpoint
    model_name = 'bert-base-uncased'
    train_head_only = False

    # path to the directory where the model will be saved
    local_path = "/kaggle/working/"
    # The '..' component places the models directory next to local_path, not inside it.
    target_dir = os.path.join(local_path,'..','models', 'bert')
    # A plain path string; a trailing comma here used to turn it into a 1-tuple.
    model_save_path = os.path.join(target_dir, 'lr')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer_args'),
        evaluation_strategy="epoch",
        learning_rate=1e-4,
        weight_decay=0.1,
        num_train_epochs=10,
        )

In [7]:
# Load the tokenizer and a token-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# The classification head is sized from the CFG label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.id2label),
    id2label=CFG.id2label,
    label2id=CFG.label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load the competition training file and turn it into model-ready rows.
# preprocess_data runs with its defaults here (with_labels=True, overlap_size=0):
# labels are encoded with CFG.label2id, documents are tokenized into chunks,
# and the chunks are flattened into one row each.
data_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
data = get_dataset_from_path(data_path)
data = preprocess_data(data, tokenizer, label2id = CFG.label2id)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:23<00:00,  3.99s/it]


In [9]:
# Split with the default val_size=0.1 and test_size=0.1, seeded from CFG.seed.
# NOTE(review): `data` is already flattened to one row per chunk, so the split
# is made over chunks rather than documents - chunks of the same document can
# end up in different splits; confirm this is acceptable for evaluation.
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

Training the BERT model with lr=0.0001

In [10]:
# Build the Trainer from the CFG training arguments, the train/validation
# splits and the tokenizer. compute_metrics is bound to CFG.LABELS_LIST so the
# metric function can turn class ids back into label names.
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
# Run training as configured in CFG.training_args (10 epochs, evaluated every epoch).
trainer.train()



Epoch,Training Loss,Validation Loss,Recall,Precision,Fbeta Score
1,0.0189,0.003188,0.059946,0.349206,0.061918
2,0.0037,0.001435,0.749319,0.766017,0.749948
3,0.0025,0.002142,0.53406,0.771654,0.54046
4,0.0006,0.001756,0.814714,0.753149,0.81216
5,0.0005,0.002034,0.839237,0.627291,0.828471
6,0.0007,0.002254,0.809264,0.671946,0.802953
7,0.0002,0.002013,0.798365,0.827684,0.799454
8,0.0002,0.002019,0.803815,0.833333,0.804911
9,0.0001,0.002191,0.798365,0.834758,0.799706
10,0.0001,0.002257,0.811989,0.818681,0.812244




TrainOutput(global_step=6490, training_loss=0.0022503415045137583, metrics={'train_runtime': 6164.399, 'train_samples_per_second': 16.834, 'train_steps_per_second': 1.053, 'total_flos': 2.711819644747776e+16, 'train_loss': 0.0022503415045137583, 'epoch': 10.0})

In [12]:
# Score the trained model on the test split returned by get_train_val_test_split.
trainer.evaluate(data_test)



{'eval_loss': 0.00291440193541348,
 'eval_recall': 0.778705636743215,
 'eval_precision': 0.7936170212765957,
 'eval_fbeta_score': 0.7792687826436321,
 'eval_runtime': 31.5543,
 'eval_samples_per_second': 40.628,
 'eval_steps_per_second': 2.567,
 'epoch': 10.0}

In [21]:
# Name the save directory after the learning rate used for this run.
lr_res = f"lr{CFG.training_args.learning_rate}"
model_save_path = os.path.join(CFG.target_dir, lr_res)

In [24]:
# Save to the notebook-level model_save_path built in the cell above
# (a separate variable from CFG.model_save_path).
trainer.save_model(model_save_path)

In [25]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; renders a widget in the notebook, so this
# cell needs user input and will not complete in an unattended run.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# SECURITY: never hard-code a Hub access token in a notebook - it is published
# together with the file. The token that used to be written here must be
# treated as leaked and revoked in the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable instead; when it is
# unset, token=None lets push_to_hub fall back to the stored login credentials.
token = os.environ.get("HF_TOKEN")
model.push_to_hub("zeinab-sheikhi/bert-pii-detection-baseline", token=token)

README.md:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zeinab-sheikhi/bert-pii-detection-baseline/commit/f06fb1b2c1da6f63c061cd3b3ee3ada6d4b7acff', commit_message='Upload BertForTokenClassification', commit_description='', oid='f06fb1b2c1da6f63c061cd3b3ee3ada6d4b7acff', pr_url=None, pr_revision=None, pr_num=None)