Libraries

# idea: BERT Tokenizer accepts texts with len> 512, it just gives a warning!!!!!
    - use bert tokenizer without truncation
    - make chunks
    - flatten

In [1]:
import torch 
from transformers import AdamW, AutoTokenizer, AutoModelForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from datasets import Dataset

Getting the model

In [2]:
MODEL_NAME = 'bert-base-uncased'


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [3]:
data_path = os.path.join('..','data', 'raw', 'train.json')

data = pd.read_json(data_path)

data_dataset = Dataset.from_pandas(data)
#data_dataset = data_dataset.remove_columns(['full_text','trailing_whitespace', 'document'])

## Preprocessing

### Label encodings

In [4]:
LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']

label2id = {label: i for i, label in enumerate(LABELS_LIST)}
label2id['[PAD]'] = -100
id2label = {i: label for label, i in label2id.items()}
id2label

{0: 'B-NAME_STUDENT',
 1: 'B-EMAIL',
 2: 'B-USERNAME',
 3: 'B-ID_NUM',
 4: 'B-PHONE_NUM',
 5: 'B-URL_PERSONAL',
 6: 'B-STREET_ADDRESS',
 7: 'I-NAME_STUDENT',
 8: 'I-EMAIL',
 9: 'I-USERNAME',
 10: 'I-ID_NUM',
 11: 'I-PHONE_NUM',
 12: 'I-URL_PERSONAL',
 13: 'I-STREET_ADDRESS',
 14: 'O',
 -100: '[PAD]'}

In [5]:
def encode_labels(example):
    """
    to be used with datasets.map() with batched=False
    
    Encodes the labels into integers.
    
    """
    labels = example['labels']
    encoded = [label2id[label] for label in labels]
    return {'labels': encoded}

In [6]:
data_labels_encoded = data_dataset.map(encode_labels)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

### Encoding tokens

In [8]:
def encode_tokens(examples):
    """
    o be used with datasets.map() with batched=True

    Expects a batch of examples to be tokenized and returns the tokenized examples.
    
    Warning: Doesn't align labels with tokens. If the text is turned into chunks, each chunk will have the labels for the full text in it's row.
    """
    result = tokenizer(examples["full_text"], 
                       padding="max_length",
                       max_length=512, 
                       truncation=True, 
                       return_overflowing_tokens=True, 
                       return_offsets_mapping=True, 
                       stride=10, 
                       return_tensors='pt')
    
    sample_map = result.pop('overflow_to_sample_mapping')
    result['overflow_to_sample_mapping'] = sample_map
    print(sample_map)
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

tokenized_dataset = data_labels_encoded.map(encode_tokens, batched=True)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   1,   2,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   0,   0,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 998, 999])
tensor([  0,   0,   1,  ..., 805, 806, 806])


In [62]:
import numpy as np

def tokenize_and_align(example, sub_word_labeling = False, overlap_size = 0):
    """
    To be used with datasets.map() with batched=False

    Takes in 
        - example : an example from the datasets class
        - sub_word_labeling: if True, labels are assigned to subwords of words. If False, subwords (other than the 1st one) are assigned the label -100
        - overlap_size: the number of tokens that overlap between two consecutive chunks
        
    outputs:
        - a Dict[]->List with columns:
            - of the bert tokenizer output
            - encoded labels
    """

    org_labels = example['labels']
    tokenized_inputs = tokenizer(example['tokens'], is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding='max_length', max_length=512, return_overflowing_tokens=True, stride=overlap_size, return_tensors='pt')
    new_labels = []
    # Update the labels with the new tokenization

    it = 0

    for doc_offset in tokenized_inputs['offset_mapping']:
        aligned_labels = []
        
        if not sub_word_labeling:
            arr_offset = np.array(doc_offset)
            #making a list of -100 of the same length as the number of BERT tokens in the chunk
            arr_aligned_labels = np.ones(len(doc_offset), dtype=int) * (-100)

            # calculating the number of labels present in the chunk
            nb_labels_in_chunk = len(arr_offset[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)])

            # assigning the labels to the corresponding tokens, meaning tokens with offset (0,y)
            arr_aligned_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = org_labels[:nb_labels_in_chunk]

            # removing the labels that have been assigned to the chunk from the list of labels
            org_labels = org_labels[nb_labels_in_chunk:]

            new_labels.append(arr_aligned_labels.tolist())
        
        
        # TODO : subword is slow

        # dumb solution with for-loops

        if sub_word_labeling:
            prev_offset = (1,1)
            for x,y in doc_offset:
                #if the offset is (0,0), it means it's a special token (CLS, SEP, etc.)
                if x == 0 and y == 0:
                    aligned_labels.append(-100)
                #if the offset is (0,y), it means it's the first subword of a word
                elif x == 0 and y !=0:
                    aligned_labels.append(org_labels[it])
                    it+=1
                #if the offset is (x,y) with x!=0, it means it's a subword of a word and the most recent label should be assigned to it
                elif x!=0 and prev_offset[1]==x:
                    aligned_labels.append(aligned_labels[-1])
                prev_offset = (x,y)
            new_labels.append(aligned_labels)


    tokenized_inputs["labels"] = new_labels
    tokenized_inputs.pop("offset_mapping")
    tokenized_inputs.pop("overflow_to_sample_mapping")

    return tokenized_inputs
    

data_chunked_encoded = data_labels_encoded.map(lambda x: tokenize_and_align(x,sub_word_labeling=True), batched=False)
#time to tokenize and chunk WITH subword labeling : 1m58s

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [64]:
print("labels aligned with tokens, with subword labeling")
print(tokenizer.convert_ids_to_tokens(data_chunked_encoded['input_ids'][0][0][2:14]))
print(data_chunked_encoded['labels'][0][0][2:14])

labels aligned with tokens, with subword labeling
['thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal']
[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0]


In [65]:
data_chunked_encoded_sub_labeling_false = data_labels_encoded.map(lambda x: tokenize_and_align(x,sub_word_labeling=False), batched=False)
#time to tokenize and chunk WITHOUT subword labeling : 46s

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [66]:
print("labels aligned with tokens, without subword labeling")
print(tokenizer.convert_ids_to_tokens(data_chunked_encoded_sub_labeling_false['input_ids'][0][0][2:14]))
print(data_chunked_encoded_sub_labeling_false['labels'][0][0][2:14])

labels aligned with tokens, without subword labeling
['thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal']
[14, 14, 14, 14, -100, 14, 14, -100, 14, 14, 0, -100]


### Flattening the rows

In [67]:
from functools import reduce

#flatten the dataset
data_flat = {}

keys = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']

for key in keys:
    data_flat[key] = reduce(lambda x,y: x+y, data_chunked_encoded[key])

data_flat = Dataset.from_dict(data_flat)

In [68]:
len(data_flat['labels'][0])

512

In [70]:
len(data_flat['input_ids'][0])

512

### Previous preprocessing functions

In [13]:
def tokenize_and_preserve_labels(examples):
    labels = []
    tokenized_sentence = []
    for word, label in zip(examples['tokens'], examples['labels']):
        #tokenizes the word using BERT's subword tokenizer
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        #adds the same label to all the subwords of the word
        labels.extend([label] * n_subwords)
    examples['tokens'] = tokenized_sentence
    examples['labels'] = labels
    return examples

In [14]:
tokenized_data = data_dataset.map(tokenize_and_preserve_labels,batched=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [8]:
def chunk(examples, block_size=510, sliding_window=False):
    tokenized_sentences = []
    labels = []
    for i in range(0, len(examples['tokens']), block_size):
        chunk_token = examples['tokens'][i:i+block_size]
        chunk_label = examples['labels'][i:i+block_size]
        if len(chunk_token) < block_size:
            chunk_token += ['[PAD]'] * (block_size - len(chunk_token))
            chunk_label += ['[PAD]'] * (block_size - len(chunk_label))
        tokenized_sentences.append(chunk_token)
        labels.append(chunk_label)
    return {'tokens': tokenized_sentences, 'labels': labels}

In [9]:
chunked_data = tokenized_data.map(chunk, batched=False)
chunked_data = chunked_data.remove_columns(['document', 'full_text','trailing_whitespace'])

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [10]:
from functools import reduce
flat_chunks_tokens = list(reduce(lambda x, y: x + y, chunked_data['tokens'], []))
flat_chunks_labels = list(reduce(lambda x, y: x + y, chunked_data['labels'], []))

In [11]:
chunked_flattened_data = Dataset.from_dict({'tokens': flat_chunks_tokens, 'labels': flat_chunks_labels})

Setting the '[PAD]' label to be encoded as -100 so that it's ignored by the loss function

({'B-NAME_STUDENT': 0,
  'B-EMAIL': 1,
  'B-USERNAME': 2,
  'B-ID_NUM': 3,
  'B-PHONE_NUM': 4,
  'B-URL_PERSONAL': 5,
  'B-STREET_ADDRESS': 6,
  'I-NAME_STUDENT': 7,
  'I-EMAIL': 8,
  'I-USERNAME': 9,
  'I-ID_NUM': 10,
  'I-PHONE_NUM': 11,
  'I-URL_PERSONAL': 12,
  'I-STREET_ADDRESS': 13,
  'O': 14,
  '[PAD]': -100},
 {0: 'B-NAME_STUDENT',
  1: 'B-EMAIL',
  2: 'B-USERNAME',
  3: 'B-ID_NUM',
  4: 'B-PHONE_NUM',
  5: 'B-URL_PERSONAL',
  6: 'B-STREET_ADDRESS',
  7: 'I-NAME_STUDENT',
  8: 'I-EMAIL',
  9: 'I-USERNAME',
  10: 'I-ID_NUM',
  11: 'I-PHONE_NUM',
  12: 'I-URL_PERSONAL',
  13: 'I-STREET_ADDRESS',
  14: 'O',
  -100: '[PAD]'})

In [14]:
chunked_flattened_data

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 12812
})

In [15]:
x = list(map(lambda x: ' '.join(x), chunked_flattened_data['tokens'][:3]))

In [16]:
x2 = list(map(lambda x: x.replace(' ##', ''), x))

In [17]:
x2

["design thinking for innovation reflexion - avril 2021 - nathalie sylla challenge & selection the tool i use to help all stakeholders finding their way through the complexity of a project is the mind map . what exactly is a mind map ? according to the definition of buzan t . and buzan b . ( 1999 , dessine - moi l ' intelligence . paris : les editions d ' organisation . ) , the mind map ( or heuristic diagram ) is a graphic representation technique that follows the natural functioning of the mind and allows the brain ' s potential to be released . cf annex1 this tool has many advantages : • it is accessible to all and does not require significant material investment and can be done quickly • it is scalable • it allows categorization and linking of information • it can be applied to any type of situation : notetaking , problem solving , analysis , creation of new ideas • it is suitable for all people and is easy to learn • it is fun and encourages exchanges • it makes visible the dimens

In [18]:
tokenizer.decode(tokenizer(x2[0])['input_ids'])

"[CLS] design thinking for innovation reflexion - avril 2021 - nathalie sylla challenge & selection the tool i use to help all stakeholders finding their way through the complexity of a project is the mind map. what exactly is a mind map? according to the definition of buzan t. and buzan b. ( 1999, dessine - moi l'intelligence. paris : les editions d'organisation. ), the mind map ( or heuristic diagram ) is a graphic representation technique that follows the natural functioning of the mind and allows the brain's potential to be released. cf annex1 this tool has many advantages : • it is accessible to all and does not require significant material investment and can be done quickly • it is scalable • it allows categorization and linking of information • it can be applied to any type of situation : notetaking, problem solving, analysis, creation of new ideas • it is suitable for all people and is easy to learn • it is fun and encourages exchanges • it makes visible the dimension of projec

In [19]:
def encode_tokens(example):
    detokenized = list(map(lambda x: ' '.join(x), example['tokens']))
    detokenized = list(map(lambda x: x.replace(' ##', ''), detokenized))
    encoded = tokenizer(detokenized, truncation=True, is_split_into_words = False, return_tensors='pt')
    return encoded

In [20]:
encoded_data = chunked_flattened_data.map(encode_tokens, batched=True)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

In [22]:
encoded_data = encoded_data.map(encode_labels, batched=False)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

In [23]:
encoded_data

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12812
})

In [24]:
encoded_data['tokens'][0][10:19], encoded_data['labels'][0][11:20], encoded_data['input_ids'][0][11:20]

(['-', 'nat', '##hal', '##ie', 'sy', '##lla', 'challenge', '&', 'selection'],
 [14, 0, 0, 0, 7, 7, 14, 14, 14],
 [1011, 14085, 8865, 2666, 25353, 4571, 4119, 1004, 4989])

In [25]:
encoded_data_split = encoded_data.train_test_split(test_size=0.1)

In [26]:
tokenizer.decode(encoded_data['input_ids'][0][11:20])

'- nathalie sylla challenge & selection'

In [27]:
#!pip install seqeval

## Model training & evaluation

In [74]:
import evaluate

seqeval = evaluate.load("seqeval")

In [29]:
import numpy as np


def compute_metrics(p):
    
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [LABELS_LIST[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [LABELS_LIST[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [30]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#testing predictions


In [32]:
# training_args = TrainingArguments(
#     output_dir="model_baseline_v0.1",
#     learning_rate=2e-3,
#     per_device_train_batch_size=20,
#     per_device_eval_batch_size=20,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

training_args = TrainingArguments(output_dir="model/test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/4326 [00:00<?, ?it/s]

{'loss': 0.0271, 'learning_rate': 4.4220989366620436e-05, 'epoch': 0.35}
{'loss': 0.0091, 'learning_rate': 3.844197873324087e-05, 'epoch': 0.69}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.002221583155915141, 'eval_precision': 0.7661691542288557, 'eval_recall': 0.6984126984126984, 'eval_f1': 0.7307236061684459, 'eval_accuracy': 0.9994324666757416, 'eval_runtime': 52.3205, 'eval_samples_per_second': 24.503, 'eval_steps_per_second': 3.077, 'epoch': 1.0}
{'loss': 0.0034, 'learning_rate': 3.266296809986131e-05, 'epoch': 1.04}
{'loss': 0.0023, 'learning_rate': 2.688395746648174e-05, 'epoch': 1.39}
{'loss': 0.0021, 'learning_rate': 2.1104946833102173e-05, 'epoch': 1.73}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.0013160010566934943, 'eval_precision': 0.8074245939675174, 'eval_recall': 0.7891156462585034, 'eval_f1': 0.7981651376146789, 'eval_accuracy': 0.9995937215317117, 'eval_runtime': 52.421, 'eval_samples_per_second': 24.456, 'eval_steps_per_second': 3.071, 'epoch': 2.0}
{'loss': 0.0013, 'learning_rate': 1.5325936199722607e-05, 'epoch': 2.08}
{'loss': 0.0005, 'learning_rate': 9.546925566343042e-06, 'epoch': 2.43}
{'loss': 0.0007, 'learning_rate': 3.7679149329634766e-06, 'epoch': 2.77}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.0011914735659956932, 'eval_precision': 0.8928571428571429, 'eval_recall': 0.7369614512471655, 'eval_f1': 0.8074534161490683, 'eval_accuracy': 0.9996481712233379, 'eval_runtime': 52.1851, 'eval_samples_per_second': 24.566, 'eval_steps_per_second': 3.085, 'epoch': 3.0}
{'train_runtime': 11353.2091, 'train_samples_per_second': 3.047, 'train_steps_per_second': 0.381, 'train_loss': 0.0054270451405342675, 'epoch': 3.0}


TrainOutput(global_step=4326, training_loss=0.0054270451405342675, metrics={'train_runtime': 11353.2091, 'train_samples_per_second': 3.047, 'train_steps_per_second': 0.381, 'train_loss': 0.0054270451405342675, 'epoch': 3.0})

In [33]:
model.save_pretrained('model/model_baseline_v0.0')

In [79]:
sen = 'my name is John Smith and my email is john.smith@gmail.com. I live in 1234 Elm Street. My phone number is 123-456-7890. My username is johnsmith123. My ID number is 123456789. My personal website is www.johnsmith.com.'

model = model.to('cpu')

predictions = model(**tokenizer(sen, return_tensors='pt'))
predictions = np.argmax(predictions.logits.detach().numpy(), axis=2)

In [80]:
bert_tokens = tokenizer.tokenize(sen)
bert_tokens = ['[CLS]'] + bert_tokens + ['[SEP]']

In [81]:
[(id2label[pred], token) for pred, token in zip(predictions[0], bert_tokens)]

[('O', '[CLS]'),
 ('O', 'my'),
 ('O', 'name'),
 ('O', 'is'),
 ('B-NAME_STUDENT', 'john'),
 ('I-NAME_STUDENT', 'smith'),
 ('O', 'and'),
 ('O', 'my'),
 ('O', 'email'),
 ('O', 'is'),
 ('B-NAME_STUDENT', 'john'),
 ('B-EMAIL', '.'),
 ('B-EMAIL', 'smith'),
 ('B-EMAIL', '@'),
 ('B-EMAIL', 'gma'),
 ('B-EMAIL', '##il'),
 ('B-EMAIL', '.'),
 ('B-EMAIL', 'com'),
 ('O', '.'),
 ('O', 'i'),
 ('O', 'live'),
 ('O', 'in'),
 ('O', '123'),
 ('O', '##4'),
 ('O', 'elm'),
 ('O', 'street'),
 ('O', '.'),
 ('O', 'my'),
 ('O', 'phone'),
 ('O', 'number'),
 ('O', 'is'),
 ('B-ID_NUM', '123'),
 ('I-PHONE_NUM', '-'),
 ('B-ID_NUM', '45'),
 ('B-ID_NUM', '##6'),
 ('I-PHONE_NUM', '-'),
 ('B-ID_NUM', '78'),
 ('B-ID_NUM', '##90'),
 ('O', '.'),
 ('O', 'my'),
 ('O', 'user'),
 ('O', '##name'),
 ('O', 'is'),
 ('O', 'johns'),
 ('B-EMAIL', '##mith'),
 ('B-ID_NUM', '##12'),
 ('B-ID_NUM', '##3'),
 ('O', '.'),
 ('O', 'my'),
 ('O', 'id'),
 ('O', 'number'),
 ('O', 'is'),
 ('B-ID_NUM', '123'),
 ('B-ID_NUM', '##45'),
 ('B-ID_NUM', '##6