### Libraries

In [134]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

RoBERTa
 The model's inputs are pieces of 512 contiguous tokens that may span multiple documents.

In [103]:
# Checkpoint name shared by the tokenizer below and the token-classification model.
MODEL_NAME = 'roberta-base'
# When True, the parameters of model.base_model are frozen and only the rest
# of the model (the classification head) is trained.
TRAIN_HEAD_ONLY = False

# add_prefix_space=True because the tokenizer is later called on pre-split
# words (is_split_into_words=True) in tokenize_and_align.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Loading the data

In [77]:
# Path to the raw training file, relative to the current working directory.
data_path = os.path.join('..','data', 'raw', 'train.json')

# Load the JSON file into a DataFrame. Later cells read the 'tokens',
# 'labels' and 'document' fields of each row.
data = pd.read_json(data_path)

# Convert to a datasets.Dataset so the preprocessing can be applied with .map().
data_dataset = Dataset.from_pandas(data)

### Labels

In [104]:
# All B-/I- prefixed entity tags plus 'O'; the position in this list is the class id
# (compute_metrics indexes LABELS_LIST with predicted ids).
LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']

# Tag string -> integer id, in LABELS_LIST order.
label2id = {label: i for i, label in enumerate(LABELS_LIST)}
# -100 is the label value tokenize_and_align assigns to tokens without a word id.
# NOTE(review): this extra entry makes label2id / id2label one element longer
# than LABELS_LIST (16 vs 15) and introduces a negative id.
label2id['[PAD]'] = -100
# Inverse map: integer id -> tag string (includes -100 -> '[PAD]').
id2label = {i: label for label, i in label2id.items()}

### Preprocessing

In [79]:
def encode_labels(example):
    """
    Map one example's string labels to their integer ids.

    Intended for datasets.map() with batched=False, i.e. `example` is a
    single row whose 'labels' field is a list of tag strings.

    Returns a dict with a single 'labels' key holding the list of ids looked
    up in the module-level `label2id`; an unknown tag raises KeyError.
    """
    label_ids = []
    for tag in example['labels']:
        label_ids.append(label2id[tag])
    return {'labels': label_ids}

In [80]:
# Replace each row's string labels with integer ids (one example per call).
data_labels_encoded = data_dataset.map(encode_labels)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [115]:
def tokenize_and_align(example, overlap_size = 0, max_length = 512):
    """
    Tokenize one document into fixed-length chunks and align labels to tokens.

    To be used with datasets.map() with batched=False.

    Takes in
        - example: a single row with
            - 'tokens': the document's words,
            - 'labels': one already-encoded (integer) label per word,
            - 'document': the document identifier.
        - overlap_size: the number of tokens that overlap between two
          consecutive chunks (passed to the tokenizer as `stride`).
        - max_length: length every chunk is truncated / padded to
          (defaults to 512, the value previously hard-coded).

    Outputs:
        - the tokenizer output for every chunk of the document (without
          'overflow_to_sample_mapping'), extended with one entry per chunk in:
            - 'labels': per-token label ids; tokens that do not belong to a
              word (special and padding tokens) get -100,
            - 'org_word_ids': per-token index of the originating word
              (None for special / padding tokens),
            - 'document': the document id, repeated for every chunk.
    """
    # Label value for tokens that have no originating word.
    ignore_label = -100

    word_labels = example['labels']
    # Offsets are not requested: they were previously computed only to be
    # dropped again right after the call.
    tokenized_inputs = tokenizer(
        example['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=overlap_size,
        return_tensors='pt',
    )
    # Every chunk comes from the same single example, so this mapping carries
    # no information here.
    tokenized_inputs.pop('overflow_to_sample_mapping')

    n_chunks = len(tokenized_inputs['input_ids'])

    # For each chunk: the word index behind every token (None for special tokens).
    org_word_ids_list = [tokenized_inputs.word_ids(chunk_idx) for chunk_idx in range(n_chunks)]

    # Every sub-token inherits the label of the word it came from.
    new_labels = [
        [ignore_label if word_id is None else word_labels[word_id] for word_id in word_ids]
        for word_ids in org_word_ids_list
    ]

    tokenized_inputs['labels'] = new_labels
    tokenized_inputs['org_word_ids'] = org_word_ids_list
    tokenized_inputs['document'] = [example['document']] * n_chunks

    return tokenized_inputs

In [116]:
# Tokenize every document; each resulting row holds the list of chunks for one document.
data_encoded_all = data_labels_encoded.map(tokenize_and_align, batched=False)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [129]:
def flatten_data(data, keys_to_flatten):
    """
    Flatten per-document lists of chunks into one row per chunk.

    Takes in
        - data: a dataset (or mapping) where data[key] is a sequence of
          lists, one list of chunk-level values per document.
        - keys_to_flatten: the column names to flatten and keep.

    Returns a new Dataset containing only `keys_to_flatten`, where each
    column is the concatenation of that column's per-document lists.
    An empty column yields an empty list instead of raising.
    """
    # A single pass per column: linear in the number of chunks, unlike
    # repeated list concatenation which copies the accumulated list each time.
    data_flat = {
        key: [chunk_value for doc_values in data[key] for chunk_value in doc_values]
        for key in keys_to_flatten
    }

    return Dataset.from_dict(data_flat)

In [130]:
# Columns carried over to the chunk-level dataset; all other columns are dropped.
keys_to_flatten = ['labels', 'input_ids', 'attention_mask', 'org_word_ids','document']
# One row per chunk instead of one row per document.
data_flat = flatten_data(data_encoded_all, keys_to_flatten)

In [133]:
def get_fbeta_score(precision, recall, beta=5.0):
    """
    Compute the F-beta score from precision and recall.

    F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall)

    Takes in
        - precision, recall: the two scores to combine.
        - beta: weight of recall relative to precision (default 5.0).

    Returns the F-beta score, or 0.0 when the denominator is zero (e.g.
    precision and recall are both 0 because no entity was predicted),
    instead of dividing by zero.
    """
    b2 = beta ** 2
    denominator = b2 * precision + recall
    if denominator == 0:
        return 0.0
    return (1 + b2) * ((precision * recall) / denominator)

def compute_metrics(p):
    """
    Metric callback for the Trainer.

    Takes in
        - p: a (predictions, labels) pair; predictions are per-token class
          scores (argmax is taken over axis 2) and labels are the gold ids.

    Positions whose gold label is -100 are dropped before scoring. The
    remaining ids are converted to tag strings through LABELS_LIST and scored
    with seqeval.

    Returns a dict with 'recall', 'precision' and 'fbeta_score'.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Skip ignored positions (special tokens).
            if gold_id == -100:
                continue
            kept_predictions.append(LABELS_LIST[predicted_id])
            kept_labels.append(LABELS_LIST[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': get_fbeta_score(precision, recall),
    }

In [135]:
# Label maps for the classification head, built from LABELS_LIST only.
# The module-level label2id / id2label also contain '[PAD]' <-> -100, which
# would give the head one extra output class and store a negative class id
# in the model config; -100 is only a label value to be ignored, not a class.
model_id2label = dict(enumerate(LABELS_LIST))
model_label2id = {label: idx for idx, label in model_id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS_LIST),
    id2label=model_id2label,
    label2id=model_label2id,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [136]:
# Optionally freeze the encoder so that only the remaining parameters are trained.
print('Training head only' if TRAIN_HEAD_ONLY else 'Training all layers')
if TRAIN_HEAD_ONLY:
    for backbone_param in model.base_model.parameters():
        backbone_param.requires_grad = False

Training all layers


In [137]:
# Fixed seed so the train/test partition is the same after a kernel restart.
SPLIT_SEED = 42
# NOTE(review): the split is done on chunk rows, so chunks of one document
# can end up on both sides of the split — confirm this is intended.
encoded_data_split = data_flat.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [138]:
# Run identifier used to derive both output locations below.
model_save_name = 'all_Roberta_layers_new_preprocessing'
# Output directory handed to TrainingArguments.
target_dir = f'model/trainer_{model_save_name}'
# Location the final model is saved to.
model_save_path = f'model/model_{model_save_name}'

In [139]:
# Only the output directory and the evaluation schedule are set here;
# every other training option keeps its library default.
training_args = TrainingArguments(
    output_dir=target_dir, 
    evaluation_strategy="epoch",
)


# Train on the 'train' part of the split, evaluate on the 'test' part,
# and report the metrics returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Long-running cell: starts the fine-tuning run.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/4767 [00:00<?, ?it/s]

In [None]:
# Evaluate on the eval_dataset given to the Trainer and display the resulting metrics.
trainer.evaluate()

In [None]:
# Save the trained model under model_save_path.
trainer.save_model(model_save_path)

## Using the model

In [None]:
# Load a token-classification model by identifier instead of the local save path.
# NOTE(review): presumably a Hugging Face Hub repo id holding a previously
# fine-tuned checkpoint — confirm it matches the label set used above.
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zeinab-sheikhi/Roberta-pii-detection-baseline')