In [1]:
# built in
import json
import random
# append to path to allow relative imports
import sys
sys.path.append("..")

# 3rd party
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from tqdm import tqdm
from transformers import BertForTokenClassification
import torch

# own
from utils.data.parse import ParseUtils

# Prep Data

In [2]:
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

MAX_SAMPLE = 3 # set a small number for experimentation, set None for production.

In [3]:
ner_data = ParseUtils.compute_ner_data(max_sample=MAX_SAMPLE)

FileNotFoundError: [Errno 2] No such file or directory: 'data/coleridgeinitiative-show-us-the-data/train.csv'

In [3]:
TRAIN_CSV = '../../data/coleridgeinitiative-show-us-the-data/train.csv'
TRAIN_DATA = '../../data/coleridgeinitiative-show-us-the-data/train'

train = pd.read_csv(TRAIN_CSV)
train = train[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

No. raw training rows: 2000


In [4]:
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train)}')

No. grouped training rows: 1811


In [6]:
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{TRAIN_DATA}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [7]:
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

pbar = tqdm(total=len(train))
for i, id, dataset_label in train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = papers[id]
    
    # labels
    labels = dataset_label.split('|')
    labels = [ParseUtils.clean_training_text(label) for label in labels]
    
    # sentences
    sentences = set([ParseUtils.clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.') 
                ])
    sentences = ParseUtils.shorten_sentences(sentences) # make sentences short
    # only accept sentences with length > 10 chars
    sentences = [sentence for sentence in sentences if len(sentence) > 10] 
    
    # positive sample
    for sentence in sentences:
        is_positive, tags = ParseUtils.tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']): 
            ner_data.append(tags)
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")
    
    if i==MAX_SAMPLE-1:
        break
    
# shuffling
#random.shuffle(ner_data)

Training data size: 2484 positives + 51683 negatives: 100%|██████████| 1811/1811 [00:15<00:00, 122.15it/s]

In [15]:
# Write data to file
with open('train_ner.json', 'w') as f:
    for row in ner_data:
        words, nes = list(zip(*row))
        row_json = {'tokens' : words, 'tags' : nes}
        json.dump(row_json, f)
        f.write('\n')

TypeError: 'NERData' object is not iterable

In [10]:
class NERData:
    #def __init__(self):
    #    self.data = list()
    
    @staticmethod
    def to_json_file(filename:str, data):
        f = open(filename, 'w')
        for row in data:
            words, nes = list(zip(*row))
            row_json = {'tokens': words, 'tags': nes}
            json.dump(row_json, f)
            f.write('\n')
        f.close()
    
    @staticmethod
    def from_json_file(filename:str, overwrite:bool=False):
        
        #if self.data:
        #    if overwrite:
        #        self.data = list()
        #    else:
        #        raise ValueError(
        #            'Data is present. If you want to overwrite it, '
        #            'run this function again with overwrite=True.')        
        data = []
        f = open(filename, 'r')
        
        for i,line in enumerate(f.readlines()):
            
            print('Reading data ... {}\r'.format(i), end='')
            
            # Each line is formatted in JSON format, e.g.
            # { "tokens" : ["A", "short", "sentence"],
            #   "tags"   : ["0", "0", "0"] }
            sentence = json.loads(line)
            
            # From the tokens and tags, we create a list of 
            # tuples of the form
            # [ ("A", "0"), ("short", "0"), ("sentence", "0")]
            sentence_tuple_list = [
                (token, tag) for token, tag 
                in zip(sentence["tokens"],sentence["tags"])
            ]
            
            # Each of these parsed sentences becomes an entry
            # in our overall data list
            data.append(sentence_tuple_list)
            
            if i==1000:
                break
            
        f.close()
        
        return data
        
    @staticmethod
    def get_sentences(data):
        """
        Convert each entry in self.data into a single-string sentence,
        with words separated by a blank space.
        """
        return [ " ".join([ tuple_[0] for tuple_ in tupled_sentence ]) 
                for tupled_sentence in data[:100] ]

In [12]:
NERData.to_json_file('../../data/train_ner_dummy.json', ner_data)

In [162]:
ner_data = NERData.from_json_file('../../data/train_ner_dummy.json')

Reading data ... 1000

In [164]:
ner_data[0]

[('This', 'O'),
 ('research', 'O'),
 ('study', 'O'),
 ('comes', 'O'),
 ('to', 'O'),
 ('validate', 'O'),
 ('and', 'O'),
 ('quantify', 'O'),
 ('a', 'O'),
 ('number', 'O'),
 ('of', 'O'),
 ('assumptions', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Lebanese', 'O'),
 ('context', 'O')]

In [13]:
text_batch = NERData.get_sentences(ner_data)
text_batch

['This research study comes to validate and quantify a number of assumptions in the Lebanese context',
 'According to a study completed by UNDP 2016 96 of semi skilled workers in the Agro food industry suffer from basic ICT skills weaknesses 89',
 'The response rate for this follow up study was 43',
 'This study was completed in an effort to find creative and digital solutions to the high rate of youth unemployment in Lebanon 37 one of the highest rates in the world',
 'Thus the generalizability of this study could be hindered',
 'Participants chosen for this study were above 18 years old labor law in Lebanon sets the eligibility age for work at 18 years who have completed DOT Lebanon ICT training',
 'The subject of this study was one of DOT Lebanon s programs Digital Media Literacy Program DML a 2 level digital youth training program delivered in 14 different centers across Lebanon',
 'The aim of this study was to identify if acquiring ICT skills through DOT Lebanon s ICT training pro

In [60]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [73]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('validate'))

[9221, 2193]

In [71]:
tokenizer.decode(tokenizer('validate')['input_ids'])

'[CLS] validate [SEP]'

In [93]:
res = tokenizer(['validate', 'I go out today', 'Something in the water looks at me'], 
                padding=True, max_length=4, truncation=True, return_overflowing_tokens=True)
ids = res['input_ids']
overflows = res['overflowing_tokens']

In [94]:
tokenizer.batch_decode(ids)

['[CLS] validate [SEP]', '[CLS] I go [SEP]', '[CLS] Something in [SEP]']

In [96]:
tokenizer.batch_decode(overflows)

['', 'today out', 'me at looks water the']

In [78]:
tokenizer.decode([101, 9221, 2193, 102])

'[CLS] validate [SEP]'

In [80]:
tokenizer.encode('validate')

[101, 9221, 2193, 102]

In [61]:
def tokenize_and_preserve_labels(tupled_sentence):
    tokenized_sentence = []

    for (word, label) in tupled_sentence:

        # Tokenize the word
        tokenized_word = tokenizer.tokenize(word)
        
        # Repeat the label for words that are broken up into several tokens
        repeated_label = [label]*len(tokenized_word)
        
        # Add the tokenized word and its label to the final tokenized word list
        tokenized_sentence.extend(zip(tokenized_word, repeated_label))

    return tokenized_sentence

In [165]:
ner_data = [
   tokenize_and_preserve_labels(sentence)
    for sentence in ner_data
]
ner_data[0]

[('This', 'O'),
 ('research', 'O'),
 ('study', 'O'),
 ('comes', 'O'),
 ('to', 'O'),
 ('valid', 'O'),
 ('##ate', 'O'),
 ('and', 'O'),
 ('q', 'O'),
 ('##uant', 'O'),
 ('##ify', 'O'),
 ('a', 'O'),
 ('number', 'O'),
 ('of', 'O'),
 ('assumptions', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Lebanese', 'O'),
 ('context', 'O')]

In [148]:
def add_start_end_tokens(tupled_sentence):
    tupled_sentence.insert(0, ('[CLS]', 'O'))
    tupled_sentence.append(('[SEP]', 'O'))
    return tupled_sentence

In [166]:
ner_data = [
   add_start_end_tokens(sentence)
    for sentence in ner_data
]
ner_data[0]

[('[CLS]', 'O'),
 ('This', 'O'),
 ('research', 'O'),
 ('study', 'O'),
 ('comes', 'O'),
 ('to', 'O'),
 ('valid', 'O'),
 ('##ate', 'O'),
 ('and', 'O'),
 ('q', 'O'),
 ('##uant', 'O'),
 ('##ify', 'O'),
 ('a', 'O'),
 ('number', 'O'),
 ('of', 'O'),
 ('assumptions', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Lebanese', 'O'),
 ('context', 'O'),
 ('[SEP]', 'O')]

In [182]:
# Get only sentences, not labels
tokenized_sentences = [[token_label_tuple[0] for token_label_tuple in sent] for sent in ner_data ]

# Get only labels, not sentences
labels = [[token_label_tuple[1] for token_label_tuple in sent] for sent in ner_data ]

tokenized_sentences[0]

['[CLS]',
 'This',
 'research',
 'study',
 'comes',
 'to',
 'valid',
 '##ate',
 'and',
 'q',
 '##uant',
 '##ify',
 'a',
 'number',
 'of',
 'assumptions',
 'in',
 'the',
 'Lebanese',
 'context',
 '[SEP]']

In [191]:
def add_padding(tokenized_sentences, labels):
    # Note that this implicitly converts to an array of objects (strings)
    
    padded_sentences = pad_sequences(
        tokenized_sentences, 
        value='[PAD]', 
        dtype=object, 
        maxlen=MAX_LENGTH, 
        truncating='post', 
        padding='post')

    padded_labels = pad_sequences(
        labels, 
        value='O', 
        dtype=object, 
        maxlen=MAX_LENGTH, 
        truncating='post', 
        padding='post')
    
    return padded_sentences, padded_labels

In [192]:
padded_sentences, padded_labels = add_padding(tokenized_sentences, labels)

In [193]:
padded_sentences[0]

array(['[CLS]', 'This', 'research', 'study', 'comes', 'to', 'valid',
       '##ate', 'and', 'q', '##uant', '##ify', 'a', 'number', 'of',
       'assumptions', 'in', 'the', 'Lebanese', 'context', '[SEP]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',
       '[PAD]'], dtype=object)

In [198]:
# Convert to integer ids
input_ids = [tokenizer.convert_tokens_to_ids(text) for text in padded_sentences]
print(input_ids[0])

[101, 1188, 1844, 2025, 2502, 1106, 9221, 2193, 1105, 186, 27280, 6120, 170, 1295, 1104, 19129, 1107, 1103, 12772, 5618, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [199]:
# Convert to integer ids
tags = [tokenizer.convert_tokens_to_ids(text) for text in padded_labels]
print(tags[0])

[152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152]


In [200]:
# All of the same length?
np.unique([len(sent) for sent in padded_sentences]), np.unique([len(sent) for sent in padded_labels])

(array([64]), array([64]))

In [207]:
# Can decode correctly?
tokenizer.batch_decode(input_ids[0:2])

['[CLS] This research study comes to validate and quantify a number of assumptions in the Lebanese context [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] According to a study completed by UNDP 2016 96 of semi skilled workers in the Agro food industry suffer from basic ICT skills weaknesses 89 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [217]:
# Get attention mask
# QUESTION: Also ignore [CLS] and [SEP] tokens?

def get_attention_mask(input_ids, ignore_tokens=[0,101,102]):
    return [[float(token not in ignore_tokens) for token in sent] 
                for sent in input_ids]
attention_mask = get_attention_mask(input_ids)
print(attention_mask[0])

[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


# Init Model

In [None]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased')

In [None]:
# Models are initialized in eval mode by default. We can call model.train() to put it in train mode.
model.train()

In [None]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)

In [57]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text_batch)))
tokens, len(tokens)

(['[CLS]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',
  '[UNK]',

In [55]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
inputs = encoding['input_ids']
attention_mask = encoding['attention_mask']

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [53]:
inputs

tensor([[ 101,  100, 2470,  ...,    0,    0,    0],
        [ 101,  100, 2000,  ...,    0,    0,    0],
        [ 101,  100, 3433,  ...,    0,    0,    0],
        ...,
        [ 101,  100, 1996,  ...,    0,    0,    0],
        [ 101, 1019, 2006,  ...,    0,    0,    0],
        [ 101,  100, 2003,  ...,    0,    0,    0]])

In [39]:
len(ner_data.data[0]), text_batch

(19,
 'After acquisition of this last DKI dataset at 8 months of age mice were then sacrificed for histological analysis')

In [None]:
labels = torch.tensor([1,0]).unsqueeze(0)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
loss.backward()
optimizer.step()
