### This notebook contains the exploration and initial code for the baseline model. The corrected and ready-to-use version will be in src/baseline.py

In [136]:
import torch 
from transformers import BertTokenizer, BertForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

In [137]:
model_name = 'bert-base-uncased'
data_path = os.path.join('..','data', 'raw', 'train.json')

In [138]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

In [139]:
def tokenize_and_preserve_labels(text, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(text, text_labels):
        #tokenizes the word using BERT's subword tokenizer
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        #adds the same label to all the subwords of the word
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [140]:
sen = "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla"
labels = ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B-NAME_STUDENT","I-NAME_STUDENT"]
print(len(sen.split()))
print(len(labels))
sen_tok= tokenize_and_preserve_labels(sen, labels)
print(sen_tok)

7
11
(['d', 'e', 's', 'i', 'g', 'n', '[MASK]', 't', 'h', 'i', 'n'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUDENT'])


In [141]:
data = pd.read_json(data_path)

In [142]:
data[['tokens','labels']]

Unnamed: 0,tokens,labels
0,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,"[Design, Thinking, for, Innovation, \n\n, Sind...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,"[Assignment, :, , Visualization, , Reflecti...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."
...,...,...
6802,"[EXAMPLE, –, JOURNEY, MAP, \n\n, THE, CHALLENG...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6803,"[Why, Mind, Mapping, ?, \n\n, Mind, maps, are,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6804,"[Challenge, \n\n, So, ,, a, few, months, back,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6805,"[Brainstorming, \n\n, Challenge, &, Selection,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [143]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(data['tokens'].head(100), data['labels'])]
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [144]:
print(tokenized_texts_and_labels[0][0][:15],tokenized_texts_and_labels[0][1][:15])
print(len(tokenized_texts_and_labels[0][0]),len(tokenized_texts_and_labels[0][1]))

['design', 'thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal', '##ie', 'sy'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'I-NAME_STUDENT']
809 809


In [168]:
def chunker(tokens, labels, max_len=512, pad_text = 0.0,pad_labels="PAD"):
    """
    goal: chunk a text and encode it using a tokenizer
    takens in:
        tokens: list of tokens 
        labels: list of labels
        max_len: int
        pad_text: any type, by default: 0.0
        pad_labels: any type, by default: "PAD" 
    outputs:
        chunked_tokens: list of chunked tokens of the text
        chunked_labels: list of chunked labels of the text
    """
    
    assert len(tokens)==len(labels)
    chunked_tokens = []
    chunked_labels = []
    for pos in range(0,len(tokens),max_len):
        pad_length = max_len - (len(tokens) % max_len)
        tokens_chunk = tokens[pos:pos+max_len]
        labels_chunk = labels[pos:pos+max_len]
        if len(tokens_chunk) != 512:
            tokens_chunk.extend(pad_length * [pad_text])
            labels_chunk.extend(pad_length * [pad_labels])
        chunked_tokens.append(tokens_chunk)
        chunked_labels.append(labels_chunk)
    return chunked_tokens,chunked_labels

def chunk_text_and_labels(text_and_labels):
    """
    goal: chunk a corpus of texts and encode it using a tokenizer
    Takes in:
        text_and_labels: list of tuples (tokens,labels)
    outputs:
        chunked_tokens: list of chunked tokens of all texts
        chunked_labels: list of chunked labels of all texts
    """
    all_chunked_tokens, all_chunked_labels = [],[]
    for tokens, labels in text_and_labels:
        chunked_tokens,chunked_labels = chunker(tokens,labels)
        all_chunked_tokens.extend(chunked_tokens)
        all_chunked_labels.extend(chunked_labels)
    return all_chunked_tokens,all_chunked_labels

In [169]:
chunked_tokens,chunked_labels=chunk_text_and_labels(tokenized_texts_and_labels)

In [171]:
print([len(chunk) for chunk in chunked_tokens])

[512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,

In [148]:
# getting the unique labels and adding PAD for padding
label_names = set([label for row in labels for label in row])
label_names.add('PAD')

MAX_LEN = max(map(len, tokenized_texts))
tag2idx = {t: i for i, t in enumerate(label_names)}
print(label_names)
print(tag2idx)
print(MAX_LEN)

{'B-ID_NUM', 'B-NAME_STUDENT', 'PAD', 'B-EMAIL', 'B-URL_PERSONAL', 'O', 'I-NAME_STUDENT'}
{'B-ID_NUM': 0, 'B-NAME_STUDENT': 1, 'PAD': 2, 'B-EMAIL': 3, 'B-URL_PERSONAL': 4, 'O': 5, 'I-NAME_STUDENT': 6}
2122


Encoding