# This notebook contains unfinished, work-in-progress methods. For the current working version of the baseline, see baseline.ipynb

Libraries

In [1]:
import torch 
from transformers import AutoTokenizer, AutoModelForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from functools import reduce


Getting the model

In [3]:
MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
data_path = os.path.join('..','data', 'raw', 'train.json')

data = pd.read_json(data_path)

data_dataset = Dataset.from_pandas(data)
#data_dataset = data_dataset.remove_columns(['full_text','trailing_whitespace', 'document'])

In [5]:
tokenized = tokenizer(data_dataset['full_text'], padding=True, truncation=True, return_overflowing_tokens=True, return_tensors='pt')

In [19]:
tokenizer.decode(tokenized['input_ids'][1])

'[CLS]ctions. this second workshop also lasts two hours and allows the mind map to evolve. once familiarized with it, the stakeholders discover the power of the tool. then, the second workshop brings out even more ideas and constructive exchanges between the stakeholders. around this new mind map, they have learned to work together and want to make visible the untold ideas. i now present all the projects i manage in this type of format in order to ease rapid understanding for decision - makers. these presentations are the core of my business models. the decision - makers are thus able to identify the opportunities of the projects and can take quick decisions to validate them. they find answers to their questions thank to a schematic representation. approach what i find amazing with the facilitation of this type of workshop is the participants commitment for the project. this tool helps to give meaning. the participants appropriate the story and want to keep writing it. then, they easil

In [6]:
# tokenized.word_ids(0)

In [7]:
# Full BIO tag inventory; a tag's position in this list is its integer class id.
LABELS_LIST = [
    'B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM',
    'B-URL_PERSONAL', 'B-STREET_ADDRESS',
    'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM',
    'I-URL_PERSONAL', 'I-STREET_ADDRESS',
    'O',
]

# tag -> id; the synthetic '[PAD]' tag is mapped to -100, the same value
# tokenize_and_align assigns to special tokens.
label2id = dict(zip(LABELS_LIST, range(len(LABELS_LIST))))
label2id['[PAD]'] = -100

# id -> tag, built by inverting label2id (so it also contains -100 -> '[PAD]').
id2label = dict(zip(label2id.values(), label2id.keys()))
id2label

{0: 'B-NAME_STUDENT',
 1: 'B-EMAIL',
 2: 'B-USERNAME',
 3: 'B-ID_NUM',
 4: 'B-PHONE_NUM',
 5: 'B-URL_PERSONAL',
 6: 'B-STREET_ADDRESS',
 7: 'I-NAME_STUDENT',
 8: 'I-EMAIL',
 9: 'I-USERNAME',
 10: 'I-ID_NUM',
 11: 'I-PHONE_NUM',
 12: 'I-URL_PERSONAL',
 13: 'I-STREET_ADDRESS',
 14: 'O',
 -100: '[PAD]'}

## Preprocessing - version 1

### Label encodings

In [8]:
def encode_labels(example):
    """
    Convert one example's string tags into their integer ids.

    To be used with datasets.map() with batched=False.

    Takes in
        - example : a single row whose 'labels' entry is an iterable of tag
          strings; every tag must be a key of the module-level label2id

    outputs:
        - a dict with a single 'labels' key holding the list of integer ids

    Raises KeyError when a tag is missing from label2id.
    """
    return {'labels': list(map(label2id.__getitem__, example['labels']))}

In [9]:
data_labels_encoded = data_dataset.map(encode_labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

### Encoding tokens

In [10]:
data_labels_encoded

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})

In [19]:
example = data_labels_encoded[0]
tokenized_inputs = tokenizer(example['tokens'], is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding='max_length', max_length=512, return_overflowing_tokens=True, stride=0, return_tensors='pt')
tokenized_inputs['input_ids'].shape


SyntaxError: keyword argument repeated: max_length (815271139.py, line 2)

In [11]:
def tokenize_and_align(example, overlap_size = 0, sub_word_labeling = True, max_length = 512):
    """
    To be used with datasets.map() with batched=False

    Tokenizes one pre-split document with the module-level `tokenizer`, cuts it
    into fixed-length chunks and aligns the word-level labels with the
    resulting sub-word tokens.

    Takes in 
        - example : an example from the datasets class; its 'tokens', 'labels'
          and 'document' entries are read
        - overlap_size: the number of tokens that overlap between two consecutive chunks
        - sub_word_labeling: if True (default, the previous behaviour) every
          sub-word token receives the label of the word it belongs to; if False
          only the first sub-word token of a word keeps the label and the
          following ones get -100
        - max_length: length every chunk is truncated / padded to
        
    outputs:
        - a Dict[]->List with columns:
            - of the bert tokenizer output (minus 'overflow_to_sample_mapping'
              and 'offset_mapping', which are dropped)
            - 'labels': one list of labels per chunk, -100 where the tokenizer
              reports no source word (CLS, SEP or PAD)
            - 'org_word_ids': per chunk, the word index of every token
            - 'document': the example's document id repeated once per chunk
    """

    org_labels = example['labels']
    tokenized_inputs = tokenizer(
        example['tokens'],
        is_split_into_words=True,
        return_offsets_mapping=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=overlap_size,
        return_tensors='pt',
    )
    # these two outputs are not used afterwards
    tokenized_inputs.pop('overflow_to_sample_mapping')
    tokenized_inputs.pop('offset_mapping')

    n_chunks = len(tokenized_inputs['input_ids'])

    new_labels = []
    org_word_ids_list = []

    #iterating over chunks
    for chunk_idx in range(n_chunks):
        word_ids = tokenized_inputs.word_ids(chunk_idx)
        org_word_ids_list.append(word_ids)

        chunk_labels = []
        # word index of the previous token in this chunk; used to spot
        # continuation sub-words when sub_word_labeling is False
        previous_word_id = None
        for word_id in word_ids:
            #if word_id is None, then it means it's some BERT token (CLS, SEP or PAD)
            if word_id is None:
                chunk_labels.append(-100)
            elif sub_word_labeling or word_id != previous_word_id:
                chunk_labels.append(org_labels[word_id])
            else:
                # continuation sub-word of the same word: excluded on request
                chunk_labels.append(-100)
            previous_word_id = word_id
        new_labels.append(chunk_labels)

    tokenized_inputs['labels'] = new_labels
    tokenized_inputs['org_word_ids'] = org_word_ids_list
    tokenized_inputs['document'] = [example['document']] * n_chunks

    return tokenized_inputs
    
data_small = data_labels_encoded.select(range(1))
data_small = data_small.map(tokenize_and_align, batched=False)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [72]:
data_encoded_all = data_labels_encoded.map(tokenize_and_align, batched=False)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [59]:
data_encoded_all

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'new_labels'],
    num_rows: 6807
})

In [73]:
data_encoded_all['document'][0]

[7, 7]

In [74]:
def flatten_data(data, keys_to_flatten):
    """
    Flatten per-document lists of chunks into one row per chunk.

    Takes in
        - data : a dataset / mapping where data[key] is an iterable of lists
          (one list of chunks per document)
        - keys_to_flatten : names of the columns to flatten

    outputs:
        - a datasets.Dataset with one column per key; each column is the
          concatenation, in the original order, of that key's per-document lists
    """
    from itertools import chain

    # chain.from_iterable concatenates in linear time and returns [] for an
    # empty column; reduce(lambda x, y: x + y, ...) re-copied the accumulated
    # list on every step and raised TypeError on empty input.
    data_flat = {
        key: list(chain.from_iterable(data[key]))
        for key in keys_to_flatten
    }

    return Dataset.from_dict(data_flat)

keys_to_flatten = ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids','document']


data_flat = flatten_data(data_encoded_all, keys_to_flatten)

data_flat

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document'],
    num_rows: 12812
})

In [None]:
data

In [42]:
# tokenize_and_align stores the aligned labels in the 'labels' column (there is no 'new_labels' column)
data_small['input_ids'][0][0][10:15], data_small['labels'][0][0][10:15], tokenizer.decode(data_small['input_ids'][0][0][10:15])

([25682, 1011, 14085, 8865, 2666], [14, 14, 0, 0, 0], '2021 - nathalie')

In [49]:
# tokenize_and_align stores the aligned labels in the 'labels' column (there is no 'new_labels' column)
len(data_small['input_ids'][0][1]), len(data_small['labels'][0][1])

(512, 512)

In [None]:
flat_labels = reduce(lambda x,y: x+y, data_small['labels'])

In [None]:
data_chunked_encoded = data_labels_encoded.map(lambda x: tokenize_and_align(x,sub_word_labeling=True), batched=False)
#time to tokenize and chunk WITH subword labeling : 1m58s

In [None]:
print("labels aligned with tokens, with subword labeling")
print(tokenizer.convert_ids_to_tokens(data_chunked_encoded['input_ids'][0][0][2:14]))
print(data_chunked_encoded['labels'][0][0][2:14])

labels aligned with tokens, with subword labeling
['thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal']
[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0]


In [None]:
data_chunked_encoded_sub_labeling_false = data_labels_encoded.map(lambda x: tokenize_and_align(x,sub_word_labeling=False), batched=False)
#time to tokenize and chunk WITHOUT subword labeling : 46s

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [None]:
print("labels aligned with tokens, without subword labeling")
print(tokenizer.convert_ids_to_tokens(data_chunked_encoded_sub_labeling_false['input_ids'][0][0][2:14]))
print(data_chunked_encoded_sub_labeling_false['labels'][0][0][2:14])

labels aligned with tokens, without subword labeling
['thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal']
[14, 14, 14, 14, -100, 14, 14, -100, 14, 14, 0, -100]


### Flattening the rows

In [None]:

#flatten the dataset: one row per chunk instead of one row per document
keys = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']

# Reuse the flatten_data helper defined earlier in this notebook rather than
# repeating its loop; data_flat is now bound only once, to the final Dataset.
data_flat = flatten_data(data_chunked_encoded, keys)

In [None]:
#checking if each row has length 512 for each column
print("Checking if each row has length 512 for each column")
all_rows_ok = True
for example in data_flat:
    for key in keys:
        if len(example[key]) != 512:
            print(f"Error in {key}, length is {len(example[key])} instead of 512")
            all_rows_ok = False
            # stop checking this row; the remaining rows are still inspected
            break
# Report success only when no mismatch was found (previously "All good" was
# printed unconditionally, even after errors had been reported).
if all_rows_ok:
    print("All good")

Checking if each row has length 512 for each column
All good


In [70]:
len(data_flat['input_ids'][0])

512

## Preprocessing - version 0

In [10]:
def tokenize_and_preserve_labels(examples):
    """
    Split every word of one example into BERT sub-word tokens and repeat the
    word's label once per sub-word.

    Reads examples['tokens'] and examples['labels'] (paired element-wise),
    overwrites both entries in place with the sub-word level sequences and
    returns the same `examples` object.
    """
    subword_tokens = []
    subword_labels = []
    for word, label in zip(examples['tokens'], examples['labels']):
        # BERT's sub-word tokenizer may split a single word into several pieces
        pieces = tokenizer.tokenize(word)
        subword_tokens += pieces
        # one copy of the word's label per produced piece
        subword_labels += [label for _ in pieces]
    examples['tokens'] = subword_tokens
    examples['labels'] = subword_labels
    return examples

In [11]:
tokenized_data = data_dataset.map(tokenize_and_preserve_labels,batched=False)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [12]:
def chunk(examples, block_size=510, sliding_window=False):
    """
    Cut one example's token / label sequences into consecutive blocks.

    A block whose token slice is shorter than `block_size` (i.e. the last one)
    is right-padded with the string '[PAD]' in both the tokens and the labels.

    Takes in
        - examples : mapping with 'tokens' and 'labels' lists
        - block_size : number of tokens per block
        - sliding_window : accepted but not used by this implementation

    outputs:
        - dict with 'tokens' and 'labels', each a list of blocks
    """
    all_tokens = examples['tokens']
    token_blocks, label_blocks = [], []
    for start in range(0, len(all_tokens), block_size):
        stop = start + block_size
        token_block = all_tokens[start:stop]
        label_block = examples['labels'][start:stop]
        if len(token_block) < block_size:
            # short block: fill both sequences up to block_size with '[PAD]'
            token_block = token_block + ['[PAD]'] * (block_size - len(token_block))
            label_block = label_block + ['[PAD]'] * (block_size - len(label_block))
        token_blocks.append(token_block)
        label_blocks.append(label_block)
    return {'tokens': token_blocks, 'labels': label_blocks}

In [13]:
chunked_data = tokenized_data.map(chunk, batched=False)
chunked_data = chunked_data.remove_columns(['document', 'full_text','trailing_whitespace'])

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [14]:
from functools import reduce
from itertools import chain

# Concatenate the per-document lists of chunks into one flat list of chunks.
# chain.from_iterable does this in linear time; reduce(lambda x, y: x + y, ..., [])
# re-copied the growing accumulator on every step.
flat_chunks_tokens = list(chain.from_iterable(chunked_data['tokens']))
flat_chunks_labels = list(chain.from_iterable(chunked_data['labels']))

In [15]:
chunked_flattened_data = Dataset.from_dict({'tokens': flat_chunks_tokens, 'labels': flat_chunks_labels})

Setting the '[PAD]' label to be encoded as -100 so that it's ignored by the loss function

In [16]:
chunked_flattened_data

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 12812
})

In [17]:
def encode_tokens(example):
    """
    Re-encode a batch of sub-word token chunks with the module-level tokenizer.

    Each chunk's tokens are joined with single spaces and every ' ##' is
    removed, which glues continuation pieces back onto the preceding piece;
    the resulting strings are then passed to the tokenizer as plain text.

    example['tokens'] is iterated as a list of token lists (the function is
    mapped with batched=True). Returns the tokenizer's output unchanged.
    """
    texts = [' '.join(tokens).replace(' ##', '') for tokens in example['tokens']]
    return tokenizer(texts, truncation=True, is_split_into_words = False, return_tensors='pt')

In [18]:
encoded_data = chunked_flattened_data.map(encode_tokens, batched=True)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

In [19]:
def encode_labels(example):
    """
    Map one chunk's string labels to integer ids and wrap them in -100.

    Reads example['labels'], looks every label up in the module-level label2id
    and returns {'labels': [-100, <ids...>, -100]}.

    NOTE(review): this redefines the encode_labels from the
    "Preprocessing - version 1" section; whichever cell ran last is the one
    that is in effect.
    """
    # the leading and trailing -100 line the labels up with the [CLS] and [SEP] tokens
    inner_ids = [label2id[tag] for tag in example['labels']]
    return {'labels': [-100, *inner_ids, -100]}

encoded_labels = encoded_data.map(encode_labels, batched=False)

Map:   0%|          | 0/12812 [00:00<?, ? examples/s]

In [20]:
encoded_data

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12812
})

In [21]:
len(encoded_labels['tokens'][0]), len(encoded_labels['labels'][0]), len(encoded_labels['input_ids'][0])

(510, 512, 512)

In [22]:
# Fixed seed so the 90/10 train/test split is the same on every run
encoded_data_split = encoded_labels.train_test_split(test_size=0.1, seed=42)

In [23]:
tokenizer.decode(encoded_labels['input_ids'][0][11:20])

'- nathalie sylla challenge & selection'

In [144]:
encoded_data_split

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11530
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1282
    })
})

## Model training & evaluation

In [170]:
#encoded_data_split = data_flat.train_test_split(test_size=0.1)

In [146]:
import evaluate

seqeval = evaluate.load("seqeval")

In [147]:
import numpy as np


def compute_metrics(p):
    """
    Compute seqeval metrics from a (predictions, labels) pair.

    Takes in
        - p : a pair (predictions, labels); predictions are reduced with
          argmax over axis 2, labels are integer ids where -100 marks
          positions that are left out of the evaluation

    outputs:
        - dict with 'precision', 'recall', 'f1' and 'accuracy', taken from
          seqeval's overall_* results
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # keep only the positions whose gold label is not the ignore value
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([LABELS_LIST[pred] for pred, _ in kept])
        true_labels.append([LABELS_LIST[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    metrics = {}
    metrics["precision"] = results["overall_precision"]
    metrics["recall"] = results["overall_recall"]
    metrics["f1"] = results["overall_f1"]
    metrics["accuracy"] = results["overall_accuracy"]
    return metrics

In [94]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# The classification head must cover only the real tags in LABELS_LIST.
# The notebook-level label2id / id2label also contain the '[PAD]' <-> -100 entry;
# passing them unfiltered made num_labels one too large (16 instead of 15) and put
# a negative id into the model config. A prediction of the surplus class could not
# be mapped back through LABELS_LIST in compute_metrics.
model_id2label = {i: label for i, label in enumerate(LABELS_LIST)}
model_label2id = {label: i for i, label in model_id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(LABELS_LIST), id2label=model_id2label, label2id=model_label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [148]:
#freezing the BERT layers
for param in model.base_model.parameters():
    param.requires_grad = False

In [149]:
# training_args = TrainingArguments(
#     output_dir="model_baseline_v0.1",
#     learning_rate=2e-3,
#     per_device_train_batch_size=20,
#     per_device_eval_batch_size=20,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )


target_dir = "model/trainer_model_initial_preprocessing"

training_args = TrainingArguments(output_dir=target_dir, evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    #tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/4326 [00:00<?, ?it/s]

{'loss': 0.0005, 'learning_rate': 4.4220989366620436e-05, 'epoch': 0.35}
{'loss': 0.0005, 'learning_rate': 3.844197873324087e-05, 'epoch': 0.69}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.00029877974884584546, 'eval_precision': 0.9052132701421801, 'eval_recall': 0.9646464646464646, 'eval_f1': 0.9339853300733496, 'eval_accuracy': 0.9998955729182983, 'eval_runtime': 53.9588, 'eval_samples_per_second': 23.759, 'eval_steps_per_second': 2.984, 'epoch': 1.0}
{'loss': 0.0006, 'learning_rate': 3.266296809986131e-05, 'epoch': 1.04}
{'loss': 0.0006, 'learning_rate': 2.688395746648174e-05, 'epoch': 1.39}
{'loss': 0.0004, 'learning_rate': 2.1104946833102173e-05, 'epoch': 1.73}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.0002911491028498858, 'eval_precision': 0.9073634204275535, 'eval_recall': 0.9646464646464646, 'eval_f1': 0.9351285189718482, 'eval_accuracy': 0.9998976614599324, 'eval_runtime': 53.6274, 'eval_samples_per_second': 23.906, 'eval_steps_per_second': 3.002, 'epoch': 2.0}
{'loss': 0.0005, 'learning_rate': 1.5325936199722607e-05, 'epoch': 2.08}
{'loss': 0.0005, 'learning_rate': 9.546925566343042e-06, 'epoch': 2.43}
{'loss': 0.0005, 'learning_rate': 3.7679149329634766e-06, 'epoch': 2.77}


  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.0002923872380051762, 'eval_precision': 0.9073634204275535, 'eval_recall': 0.9646464646464646, 'eval_f1': 0.9351285189718482, 'eval_accuracy': 0.9998976614599324, 'eval_runtime': 53.2128, 'eval_samples_per_second': 24.092, 'eval_steps_per_second': 3.026, 'epoch': 3.0}
{'train_runtime': 1946.9332, 'train_samples_per_second': 17.766, 'train_steps_per_second': 2.222, 'train_loss': 0.0004889280520210759, 'epoch': 3.0}


TrainOutput(global_step=4326, training_loss=0.0004889280520210759, metrics={'train_runtime': 1946.9332, 'train_samples_per_second': 17.766, 'train_steps_per_second': 2.222, 'train_loss': 0.0004889280520210759, 'epoch': 3.0})

In [150]:
trainer.save_model('model/model_initial_preprocessing')

In [183]:
#move the trainer's model to the 'mps' device (note: this is not the CPU)

trainer.model.to('mps')

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [184]:
trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.0002923872380051762,
 'eval_precision': 0.9073634204275535,
 'eval_recall': 0.9646464646464646,
 'eval_f1': 0.9351285189718482,
 'eval_accuracy': 0.9998976614599324,
 'eval_runtime': 54.1181,
 'eval_samples_per_second': 23.689,
 'eval_steps_per_second': 2.975,
 'epoch': 3.0}

In [162]:
model = trainer.model

In [171]:
sen = 'my name is John Smith and my email is john.smith@gmail.com.'

model = model.to('cpu')

predictions = model(**tokenizer(sen, return_tensors='pt'))
predictions = np.argmax(predictions.logits.detach().numpy(), axis=2)

In [172]:
bert_tokens = tokenizer.tokenize(sen)
bert_tokens = ['[CLS]'] + bert_tokens + ['[SEP]']

In [176]:
predictions

array([[14, 14, 14, 14,  0,  7, 14, 14, 14, 14,  0,  1,  1,  1,  1,  1,
         1,  1, 14, 14]])

In [175]:
[(id2label[pred], token) for pred, token in zip(predictions[0], bert_tokens)]

[('O', '[CLS]'),
 ('O', 'my'),
 ('O', 'name'),
 ('O', 'is'),
 ('B-NAME_STUDENT', 'john'),
 ('I-NAME_STUDENT', 'smith'),
 ('O', 'and'),
 ('O', 'my'),
 ('O', 'email'),
 ('O', 'is'),
 ('B-NAME_STUDENT', 'john'),
 ('B-EMAIL', '.'),
 ('B-EMAIL', 'smith'),
 ('B-EMAIL', '@'),
 ('B-EMAIL', 'gma'),
 ('B-EMAIL', '##il'),
 ('B-EMAIL', '.'),
 ('B-EMAIL', 'com'),
 ('O', '.'),
 ('O', '[SEP]')]

In [177]:
#load the saved model from disk so it can be used in a pipeline / evaluated on the test set

from transformers import TokenClassificationPipeline

model_loaded = AutoModelForTokenClassification.from_pretrained('model/model_initial_preprocessing')
# Move the freshly loaded model itself to the CPU. The previous
# `model_loaded = model.to('cpu')` discarded it and re-bound the name to the
# in-memory `model`, so the saved checkpoint was never actually used.
model_loaded = model_loaded.to('cpu')

In [180]:
essay = """
'The consequences of Great Planning', by John Smith

The consequences of great planning are many. One of the most important consequences is that it allows us to achieve our goals. When we plan, we are able to think about what we want to achieve and how we are going to achieve it. This helps us to focus on what is important and to avoid wasting time on things that are not important. Planning also helps us to see the big picture and to understand how all the different parts of a project fit together. This can help us to make better decisions and to avoid making mistakes.

Another consequence of great planning is that it helps us to be more efficient. When we plan, we can break a big task down into smaller tasks and then work on each task one at a time. This can help us to stay focused and to avoid feeling overwhelmed. Planning can also help us to identify potential problems before they occur and to come up with solutions to those problems. This can help us to save time and to avoid wasting resources.

Finally, great planning can help us to be more successful. When we plan, we are able to set clear goals and to create a roadmap for how we are going to achieve those goals. This can help us to stay motivated and to keep moving forward, even when things get tough. Planning can also help us to track our progress and to see how far we have come. This can help us to stay focused and to keep working towards our goals, even when we face obstacles.

In conclusion, the consequences of great planning are many. Planning can help us to achieve our goals, to be more efficient, and to be more successful. It can help us to focus on what is important, to avoid wasting time, and to make better decisions. It can help us to stay motivated, to track our progress, and to keep moving forward. In short, great planning is essential for success in any endeavor.

email: john.smith@bmail.com
phone: 123-456-7890

"""

# Inference only: switch off dropout and gradient tracking so the predictions
# are deterministic, and truncate so an essay longer than the model's maximum
# input length does not crash the forward pass.
model.eval()
encoded_essay = tokenizer(essay, return_tensors='pt', truncation=True)
with torch.no_grad():
    predictions = model(**encoded_essay)
predictions = np.argmax(predictions.logits.detach().numpy(), axis=2)

# Read the tokens back from the ids that were actually fed to the model
# (special tokens included) so they always line up with the predictions.
bert_tokens = tokenizer.convert_ids_to_tokens(encoded_essay['input_ids'][0].tolist())

[(id2label[pred], token) for pred, token in zip(predictions[0], bert_tokens)]

[('O', '[CLS]'),
 ('O', "'"),
 ('O', 'the'),
 ('O', 'consequences'),
 ('O', 'of'),
 ('O', 'great'),
 ('O', 'planning'),
 ('O', "'"),
 ('O', ','),
 ('O', 'by'),
 ('B-NAME_STUDENT', 'john'),
 ('I-NAME_STUDENT', 'smith'),
 ('O', 'the'),
 ('O', 'consequences'),
 ('O', 'of'),
 ('O', 'great'),
 ('O', 'planning'),
 ('O', 'are'),
 ('O', 'many'),
 ('O', '.'),
 ('O', 'one'),
 ('O', 'of'),
 ('O', 'the'),
 ('O', 'most'),
 ('O', 'important'),
 ('O', 'consequences'),
 ('O', 'is'),
 ('O', 'that'),
 ('O', 'it'),
 ('O', 'allows'),
 ('O', 'us'),
 ('O', 'to'),
 ('O', 'achieve'),
 ('O', 'our'),
 ('O', 'goals'),
 ('O', '.'),
 ('O', 'when'),
 ('O', 'we'),
 ('O', 'plan'),
 ('O', ','),
 ('O', 'we'),
 ('O', 'are'),
 ('O', 'able'),
 ('O', 'to'),
 ('O', 'think'),
 ('O', 'about'),
 ('O', 'what'),
 ('O', 'we'),
 ('O', 'want'),
 ('O', 'to'),
 ('O', 'achieve'),
 ('O', 'and'),
 ('O', 'how'),
 ('O', 'we'),
 ('O', 'are'),
 ('O', 'going'),
 ('O', 'to'),
 ('O', 'achieve'),
 ('O', 'it'),
 ('O', '.'),
 ('O', 'this'),
 ('O'

In [119]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data_split["train"],
    eval_dataset=encoded_data_split["test"],
    compute_metrics=compute_metrics,
)

trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.010430662892758846,
 'eval_precision': 0.10179640718562874,
 'eval_recall': 0.22869955156950672,
 'eval_f1': 0.1408839779005525,
 'eval_accuracy': 0.9983660777738643,
 'eval_runtime': 56.1491,
 'eval_samples_per_second': 22.832,
 'eval_steps_per_second': 2.867}

In [185]:
data_flat['labels'][0]

[-100,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 0,
 0,
 0,
 7,
 7,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14

In [189]:
len(encoded_labels['labels'][0])

512

In [194]:
for i, (l1,l2) in enumerate(zip(data_flat['labels'][0], encoded_labels['labels'][0])):
    if l1 != l2:
        print(i, l1, l2)

print(tokenizer.convert_ids_to_tokens(encoded_labels['input_ids'][0][474:479]))
print(tokenizer.convert_ids_to_tokens(data_flat['input_ids'][0][474:479]))

data_flat['input_ids'][0] == encoded_labels['input_ids'][0]

474 14 0
475 14 0
476 14 0
477 14 7
478 14 7
['nat', '##hal', '##ie', 'sy', '##lla']
['nat', '##hal', '##ie', 'sy', '##lla']


True

In [190]:
len(data_flat['labels'][0])

512