### This notebook contains the exploration and initial code for the baseline model. The corrected and ready-to-use version will be in src/baseline.py

In [35]:
import torch 
from transformers import BertTokenizer, BertForTokenClassification
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

In [36]:
model_name = 'bert-base-uncased'
data_path = os.path.join('..','data', 'raw', 'train.json')

In [18]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

  5%|▍         | 20975616/435779157 [02:18<52:27, 131781.58B/s]

In [53]:
def tokenize_and_preserve_labels(text, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(text, text_labels):
        #tokenizes the word using BERT's subword tokenizer
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        #adds the same label to all the subwords of the word
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [52]:
sen = "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla"
labels = ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B-NAME_STUDENT","I-NAME_STUDENT"]
print(len(sen.split()))
print(len(labels))
sen_tok= tokenize_and_preserve_labels(sen, labels)
print(sen_tok)

7
11
(['design', 'thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal', '##ie', 'sy', '##lla'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [54]:
data = pd.read_json(data_path)

In [55]:
data[['tokens','labels']]

Unnamed: 0,tokens,labels
0,"[Design, Thinking, for, innovation, reflexion,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,"[Design, Thinking, for, Innovation, \n\n, Sind...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,"[Assignment, :, , Visualization, , Reflecti...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."
...,...,...
6802,"[EXAMPLE, –, JOURNEY, MAP, \n\n, THE, CHALLENG...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6803,"[Why, Mind, Mapping, ?, \n\n, Mind, maps, are,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6804,"[Challenge, \n\n, So, ,, a, few, months, back,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6805,"[Brainstorming, \n\n, Challenge, &, Selection,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [56]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(data['tokens'].head(100), data['labels'])]
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [57]:
def chunker(tokens,labels,max_len=512):
    if len(tokens)<=max_len:
        return [(tokens,labels)]
    return [(tokens[pos:pos+max_len],labels[pos:pos+max_len]) for pos in range(0,len(tokens),max_len)]

def chunk_text_and_labels(text_and_labels):
    chunked_data = []
    for tokens, labels in text_and_labels:
        chunked_data.extend(chunker(tokens,labels))
    return chunked_data

In [58]:
chunked_data=chunk_text_and_labels(tokenized_texts_and_labels[:10])

In [59]:
print(tokenized_texts_and_labels[0][0][:15],tokenized_texts_and_labels[0][1][:15])
print(len(tokenized_texts_and_labels[0][0]),len(tokenized_texts_and_labels[0][1]))

['design', 'thinking', 'for', 'innovation', 'reflex', '##ion', '-', 'av', '##ril', '2021', '-', 'nat', '##hal', '##ie', 'sy'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'B-NAME_STUDENT', 'I-NAME_STUDENT']
809 809


KeyError: 'labels'

In [9]:
# getting the unique labels and adding PAD for padding
label_names = set([label for row in labels for label in row])
label_names.add('PAD')

MAX_LEN = max(map(len, tokenized_texts))
tag2idx = {t: i for i, t in enumerate(label_names)}
print(label_names)
print(tag2idx)
print(MAX_LEN)

{'B-STREET_ADDRESS', 'B-EMAIL', 'I-URL_PERSONAL', 'B-PHONE_NUM', 'B-ID_NUM', 'B-USERNAME', 'PAD', 'I-ID_NUM', 'I-NAME_STUDENT', 'B-NAME_STUDENT', 'O', 'I-STREET_ADDRESS', 'B-URL_PERSONAL', 'I-PHONE_NUM'}
{'B-STREET_ADDRESS': 0, 'B-EMAIL': 1, 'I-URL_PERSONAL': 2, 'B-PHONE_NUM': 3, 'B-ID_NUM': 4, 'B-USERNAME': 5, 'PAD': 6, 'I-ID_NUM': 7, 'I-NAME_STUDENT': 8, 'B-NAME_STUDENT': 9, 'O': 10, 'I-STREET_ADDRESS': 11, 'B-URL_PERSONAL': 12, 'I-PHONE_NUM': 13}
3298


We will need to do some chunking of the dataset here, because right now most texts exceeed the 512 BERT limit

In [29]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post")
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post", dtype="long", truncating="post")

Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (729 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1071 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1927 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t