### This notebook contains the exploration and initial code for the baseline model. The corrected and ready-to-use version will be in src/baseline.py

In [1]:
import torch 
from transformers import AdamW, AutoTokenizer, AutoModelForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from datasets import Dataset

In [2]:
model_name = 'bert-base-uncased'
data_path = os.path.join('..','data', 'raw', 'train.json')

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=15)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
def tokenize_and_preserve_labels(text, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(text, text_labels):
        #tokenizes the word using BERT's subword tokenizer
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        #adds the same label to all the subwords of the word
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [131]:
sen = 'Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla'.split(" ")
labels = ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B-NAME_STUDENT","I-NAME_STUDENT"]
print(len(labels))
sen_tok= tokenize_and_preserve_labels(sen, labels)
print(sen_tok)

11
(['design', 'thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal', '##ie', 'sy', '##lla'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [133]:
tokenizer(sen_tok[0], is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

{'input_ids': tensor([[  101,  2640,  3241,  2005,  8144, 22259,  1001,  1001, 10163,  1011,
         20704,  1001,  1001, 15544,  2140, 25682,  1011, 14085,  1001,  1001,
         11085,  1001,  1001, 29464, 25353,  1001,  1001,  2222,  2050,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}

In [94]:
data = pd.read_json(data_path)

In [112]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(data['tokens'].head(1000), data['labels'])]
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [97]:
print(tokenized_texts_and_labels[0][0][:15],tokenized_texts_and_labels[0][1][:15])
print(len(tokenized_texts_and_labels[0][0]),len(tokenized_texts_and_labels[0][1]))

['design', 'thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal', '##ie', 'sy'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'I-NAME_STUDENT']
746 746


In [121]:
len(tokenized_texts_and_labels[0][0][0][0])

1

In [129]:
[sen[0] for sen in tokenized_texts_and_labels[:10]][0]

['design',
 'thinking',
 'for',
 'innovation',
 'reflex',
 '##ion',
 '-',
 'av',
 '##ril',
 '2021',
 '-',
 'nat',
 '##hal',
 '##ie',
 'sy',
 '##lla',
 'challenge',
 '&',
 'selection',
 'the',
 'tool',
 'i',
 'use',
 'to',
 'help',
 'all',
 'stakeholders',
 'finding',
 'their',
 'way',
 'through',
 'the',
 'complexity',
 'of',
 'a',
 'project',
 'is',
 'the',
 'mind',
 'map',
 '.',
 'what',
 'exactly',
 'is',
 'a',
 'mind',
 'map',
 '?',
 'according',
 'to',
 'the',
 'definition',
 'of',
 'bu',
 '##zan',
 't',
 '.',
 'and',
 'bu',
 '##zan',
 'b',
 '.',
 '(',
 '1999',
 ',',
 'des',
 '##sin',
 '##e',
 '-',
 'moi',
 'l',
 "'",
 'intelligence',
 '.',
 'paris',
 ':',
 'les',
 'editions',
 'd',
 "'",
 'organisation',
 '.',
 ')',
 ',',
 'the',
 'mind',
 'map',
 '(',
 'or',
 'he',
 '##uri',
 '##stic',
 'diagram',
 ')',
 'is',
 'a',
 'graphic',
 'representation',
 'technique',
 'that',
 'follows',
 'the',
 'natural',
 'functioning',
 'of',
 'the',
 'mind',
 'and',
 'allows',
 'the',
 'brain',
 "

In [103]:
def chunker(tokens, labels, max_len=512, pad_text = "[PAD]",pad_labels="[PAD]"):
    """
    goal: chunk a text and encode it using a tokenizer
    takens in:
        tokens: list of tokens 
        labels: list of labels
        max_len: int
        pad_text: any type, by default: "[PAD]"
        pad_labels: any type, by default: "[PAD]" 
    outputs:
        chunked_tokens: list of chunked tokens of the text
        chunked_labels: list of chunked labels of the text
    """
    
    assert len(tokens)==len(labels)
    chunked_tokens = []
    chunked_labels = []
    for pos in range(0,len(tokens),max_len):
        pad_length = max_len - (len(tokens) % max_len)
        tokens_chunk = tokens[pos:pos+max_len]
        labels_chunk = labels[pos:pos+max_len]
        if len(tokens_chunk) != 512:
            tokens_chunk.extend(pad_length * [pad_text])
            labels_chunk.extend(pad_length * [pad_labels])
        chunked_tokens.append(tokens_chunk)
        chunked_labels.append(labels_chunk)
    return chunked_tokens,chunked_labels

def chunk_text_and_labels(text_and_labels):
    """
    goal: chunk a corpus of texts and encode it using a tokenizer
    Takes in:
        text_and_labels: list of tuples (tokens,labels)
    outputs:
        chunked_tokens: list of chunked tokens of all texts
        chunked_labels: list of chunked labels of all texts
    """
    all_chunked_tokens, all_chunked_labels = [],[]
    for tokens, labels in text_and_labels:
        chunked_tokens,chunked_labels = chunker(tokens,labels)
        all_chunked_tokens.extend(chunked_tokens)
        all_chunked_labels.extend(chunked_labels)
    return all_chunked_tokens,all_chunked_labels

In [104]:
chunked_tokens,chunked_labels=chunk_text_and_labels(tokenized_texts_and_labels)

In [106]:
dataset_chunked = Dataset.from_dict({"tokens":chunked_tokens,"tags":chunked_labels})

In [107]:
dataset_chunked

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 204
})

In [None]:
def encode_dataset(dataset):
    """
    goal: encode a dataset using a tokenizer
    Takes in:
        dataset: dataset
    outputs:
        dataset: dataset
    """
    encoded = tokenizer(dataset['tokens'], is_split_into_words=True, padding=True, truncation=True, return_tensors='pt')
    encoded['labels'] = dataset['tags']
    return encoded

In [160]:
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT']

In [158]:
# getting the unique labels and adding PAD for padding
label_names = set([label for row in labels for label in row])
label_names.add('[PAD]')

MAX_LEN = max(map(len, tokenized_texts))
tag2idx = {t: i+1 for i, t in enumerate(label_names)}
tag2idx['PAD'] = 0

#idx2tag = {i: t for t, i in tag2idx.items()}

print(label_names)
print(tag2idx)
print(MAX_LEN)

{'T', 'E', 'U', 'A', 'D', '[PAD]', 'N', 'M', 'I', 'O', '_', 'S', '-', 'B'}
{'T': 1, 'E': 2, 'U': 3, 'A': 4, 'D': 5, '[PAD]': 6, 'N': 7, 'M': 8, 'I': 9, 'O': 10, '_': 11, 'S': 12, '-': 13, 'B': 14, 'PAD': 0}
2833


Encoding the dataset

In [16]:
chunked_tokens[0]

['design',
 'thinking',
 'for',
 'innovation',
 'reflex',
 '##ion',
 '-',
 'av',
 '##ril',
 '2021',
 '-',
 'nat',
 '##hal',
 '##ie',
 'sy',
 '##lla',
 'challenge',
 '&',
 'selection',
 'the',
 'tool',
 'i',
 'use',
 'to',
 'help',
 'all',
 'stakeholders',
 'finding',
 'their',
 'way',
 'through',
 'the',
 'complexity',
 'of',
 'a',
 'project',
 'is',
 'the',
 'mind',
 'map',
 '.',
 'what',
 'exactly',
 'is',
 'a',
 'mind',
 'map',
 '?',
 'according',
 'to',
 'the',
 'definition',
 'of',
 'bu',
 '##zan',
 't',
 '.',
 'and',
 'bu',
 '##zan',
 'b',
 '.',
 '(',
 '1999',
 ',',
 'des',
 '##sin',
 '##e',
 '-',
 'moi',
 'l',
 "'",
 'intelligence',
 '.',
 'paris',
 ':',
 'les',
 'editions',
 'd',
 "'",
 'organisation',
 '.',
 ')',
 ',',
 'the',
 'mind',
 'map',
 '(',
 'or',
 'he',
 '##uri',
 '##stic',
 'diagram',
 ')',
 'is',
 'a',
 'graphic',
 'representation',
 'technique',
 'that',
 'follows',
 'the',
 'natural',
 'functioning',
 'of',
 'the',
 'mind',
 'and',
 'allows',
 'the',
 'brain',
 "

In [17]:
contains_float = any(isinstance(token, float) for chunks in chunked_tokens for token in chunks)
contains_float

False

In [18]:
encoded_labels = [[tag2idx[l] for l in lab] for lab in chunked_labels]
encoded_texts = [tokenizer.convert_tokens_to_ids(chunk) for chunk in chunked_tokens]

In [19]:
encoded_texts

[[2640,
  3241,
  2005,
  8144,
  22259,
  3258,
  1011,
  20704,
  15928,
  25682,
  1011,
  14085,
  8865,
  2666,
  25353,
  4571,
  4119,
  1004,
  4989,
  1996,
  6994,
  1045,
  2224,
  2000,
  2393,
  2035,
  22859,
  4531,
  2037,
  2126,
  2083,
  1996,
  11619,
  1997,
  1037,
  2622,
  2003,
  1996,
  2568,
  4949,
  1012,
  2054,
  3599,
  2003,
  1037,
  2568,
  4949,
  1029,
  2429,
  2000,
  1996,
  6210,
  1997,
  20934,
  13471,
  1056,
  1012,
  1998,
  20934,
  13471,
  1038,
  1012,
  1006,
  2639,
  1010,
  4078,
  11493,
  2063,
  1011,
  25175,
  1048,
  1005,
  4454,
  1012,
  3000,
  1024,
  4649,
  6572,
  1040,
  1005,
  5502,
  1012,
  1007,
  1010,
  1996,
  2568,
  4949,
  1006,
  2030,
  2002,
  9496,
  10074,
  16403,
  1007,
  2003,
  1037,
  8425,
  6630,
  6028,
  2008,
  4076,
  1996,
  3019,
  12285,
  1997,
  1996,
  2568,
  1998,
  4473,
  1996,
  4167,
  1005,
  1055,
  4022,
  2000,
  2022,
  2207,
  1012,
  12935,
  17827,
  2487,
  2023,
  699

In [20]:
#print(dataset['labels'])

In [21]:
!pip install evaluate
!pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [7]:
from datasets import Dataset
data = pd.read_json(data_path)
dataset = Dataset.from_pandas(data)

In [8]:
# label_to_int = {'B-NAME_STUDENT':0, 'B-EMAIL':1, 'B-USERNAME':2, 'B-ID_NUM':3, 'B-PHONE_NUM':4, 'B-URL_PERSONAL':5, 'B-STREET_ADDRESS':6, 'I-NAME_STUDENT':7, 'I-EMAIL':8, 'I-USERNAME':9, 'I-ID_NUM':10, 'I-PHONE_NUM':11 ,'I-URL_PERSONAL':12,'I-STREET_ADDRESS':13, 'O':14}
# int_to_label = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
from datasets import Features, ClassLabel
features = Features({'label': ClassLabel(num_classes=15, names=['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O'])})

In [11]:
def tokenize_function(examples):
    result = tokenizer(examples["full_text"], padding="max_length",max_length=512, truncation=True, return_overflowing_tokens=True, stride=10, return_tensors='pt')
    sample_map = result.pop('overflow_to_sample_mapping')
    print(sample_map)
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   1,   2,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 999, 999])
tensor([  0,   0,   0,  ..., 998, 999, 999])
tensor([  0,   0,   1,  ..., 998, 998, 999])
tensor([  0,   0,   1,  ..., 805, 806, 806])


In [17]:
len(tokenized_dataset['labels'][2]), len(tokenized_dataset['input_ids'][0])

(563, 512)

In [25]:
dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})

In [26]:
# model_inputs = tokenizer(dataset['full_text'], return_overflowing_tokens=True, stride=10, return_tensors='pt')

In [27]:
# print(len(model_inputs))

In [28]:
# too_long = [ex for ex in dataset if len(ex['full_text'])>512]

In [29]:
# nathalie_sylla = too_long[0]
# nat_tok = tokenizer(nathalie_sylla['full_text'])
# # nat_tok2 = tokenizer(nathalie_sylla['tokens'])

# print(len(nathalie_sylla['full_text']), len(nathalie_sylla['tokens']),len(nat_tok),nat_tok)

In [30]:
# nat_tok3 = tokenizer(nathalie_sylla['full_text'], padding=True, truncation=True, max_length=512, return_overflowing_tokens=True, stride=10, return_tensors='pt')
# print(len(nat_tok['input_ids']), len(nat_tok3['input_ids'][0]),len(nat_tok3['input_ids'][1]))
# print(nat_tok3)
# overflow = nat_tok3.pop('overflow_to_sample_mapping')
# emb = model(**nat_tok3)

In [31]:
import numpy as np

In [32]:
tokenized_dataset.train_test_split(test_size=0.1)

DatasetDict({
    train: Dataset({
        features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11560
    })
    test: Dataset({
        features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1285
    })
})

In [33]:
tokenized_dataset_train_test = tokenized_dataset.train_test_split(test_size=0.1)

In [46]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        data = {'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.labels[idx]}
        return data

    def __len__(self):
        return len(self.input_ids)

In [47]:
training_data = CustomDataset(tokenized_dataset_train_test['train']['input_ids'], 
                              tokenized_dataset_train_test['train']['attention_mask'], 
                              tokenized_dataset_train_test['train']['token_type_ids'])
eval_data = CustomDataset(tokenized_dataset_train_test['test']['input_ids'],
                            tokenized_dataset_train_test['test']['attention_mask'],
                            tokenized_dataset_train_test['test']['token_type_ids'])

In [36]:
# dataset = np.array(tokenized_dataset['input_ids'])
# labels = np.array(encoded_labels)

In [41]:
train_test= tokenized_dataset.train_test_split(test_size=0.1)

In [48]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
metric = evaluate.load("accuracy")
training_args = TrainingArguments('test_trainer')
trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data, compute_metrics=compute_metrics)
trainer.train()
# learning rate scheduler = function that adapts the learning rate depending on the iteration

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/4335 [00:00<?, ?it/s]

{'loss': 0.0131, 'learning_rate': 4.423298731257209e-05, 'epoch': 0.35}
{'loss': 0.0001, 'learning_rate': 3.846597462514418e-05, 'epoch': 0.69}
{'loss': 0.0, 'learning_rate': 3.269896193771627e-05, 'epoch': 1.04}
{'loss': 0.0, 'learning_rate': 2.6931949250288352e-05, 'epoch': 1.38}
{'loss': 0.0, 'learning_rate': 2.116493656286044e-05, 'epoch': 1.73}
{'loss': 0.0, 'learning_rate': 1.5397923875432525e-05, 'epoch': 2.08}
{'loss': 0.0, 'learning_rate': 9.630911188004614e-06, 'epoch': 2.42}
{'loss': 0.0, 'learning_rate': 3.863898500576701e-06, 'epoch': 2.77}
{'train_runtime': 5051.3746, 'train_samples_per_second': 6.865, 'train_steps_per_second': 0.858, 'train_loss': 0.0015312985991339405, 'epoch': 3.0}


TrainOutput(global_step=4335, training_loss=0.0015312985991339405, metrics={'train_runtime': 5051.3746, 'train_samples_per_second': 6.865, 'train_steps_per_second': 0.858, 'train_loss': 0.0015312985991339405, 'epoch': 3.0})

model

In [50]:
model.save_pretrained('model_baseline')

In [134]:
print(tokenized_dataset)

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12845
})


In [142]:
input = tokenizer("Hello, this one sentence!", return_tensors="pt")
out = model(**input)

In [169]:
features['label'].names

['B-NAME_STUDENT',
 'B-EMAIL',
 'B-USERNAME',
 'B-ID_NUM',
 'B-PHONE_NUM',
 'B-URL_PERSONAL',
 'B-STREET_ADDRESS',
 'I-NAME_STUDENT',
 'I-EMAIL',
 'I-USERNAME',
 'I-ID_NUM',
 'I-PHONE_NUM',
 'I-URL_PERSONAL',
 'I-STREET_ADDRESS',
 'O']

In [176]:
out[0].shape
tags = np.argmax(out[0].detach().numpy(), axis=2)
print(tags[0])
tags = [features['label'].names[tag] for tag in tags[0]]
print(tags)

[14 14  3 10 10  4 11 12]
['O', 'O', 'B-ID_NUM', 'I-ID_NUM', 'I-ID_NUM', 'B-PHONE_NUM', 'I-PHONE_NUM', 'I-URL_PERSONAL']
