In [1]:
import pandas as pd
import numpy as np
import json
import os
import tensorflow_hub as hub
from tensorflow.keras.models import Model
import tensorflow as tf
from transformers import BertTokenizer
from html.parser import HTMLParser
import tensorflow_addons as tfa
from tqdm import tqdm

For the model we will be using TensorFlow to implement a BERT model. On top of the BERT layer, there will be an additional 128-neuron layer before the output. The output layer will consist of 2 neurons, giving the probability of the candidate being correct or wrong. 

The first thing we need to do is get our data into the right input format for BERT. BERT expects input data in a specific format, with special tokens to mark the beginning ([CLS]) and separation/end of sentences ([SEP]). Furthermore, we need to tokenize our text into tokens that correspond to BERT’s vocabulary.

The BERT layer requires 3 input sequences: <br><br>
<b>Token ids:</b> for every token in the sentence. This will be from the BERT vocab file <br><br>
<b>Mask ids:</b> for every token, to mask out tokens used only for the sequence padding (so every sequence has the same length). The max sequence length we are using will be 256, so if an example is only 200 tokens long, the first 200 entries in Mask ids will be 1 and the last 56 entries will be 0 <br><br>
<b>Segment ids:</b> 0 for one-sentence sequence, 1 if there are two sentences in the sequence and it is the second one<br><br>
The data will have to be in the form of tensorflow tensors. We will now write the function to create input data with all these conditions

In [3]:
vocab_path = "assets/vocab.txt" #Bert Vocab file
df = pd.read_csv("assets/5000_rows.csv")
tokenizer = BertTokenizer(vocab_file=vocab_path)

In [4]:
df.head()

Unnamed: 0,ids,questions,candidate,target,short_answer,has_sa,cos_s
0,667957236772653012,what was the number one song in june 1994,<Table> <Tr> <Th> Issue date </Th> <Th> Song <...,1,`` I Swear '',1,0.011881
1,6844915846944003488,when did the live action scooby doo come out,"<P> On Rotten Tomatoes , the film has an appro...",0,"June 14 , 2002",1,0.229416
2,6722765189438257616,who wrote what the hell did i say,<P> `` What the Hell Did I Say '' is a song by...,1,country music artist Dierks Bentley,1,0.348899
3,8133224726310396603,who narrated the original 1966 how the grinch ...,<P> Because Thurl Ravenscroft was not credited...,0,Boris Karloff,1,0.394178
4,7521828414792809112,when is it legal to smoke weed in canada,<P> In the 1960s cannabis began to rapidly inc...,0,,0,0.287019


In [5]:
tokenizer = BertTokenizer(vocab_file=vocab_path)
def get_masks(tokens, max_seq_length):
    """Build the padding mask for one example.

    Returns a list of length ``max_seq_length`` holding 1 for every entry of
    ``tokens`` followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    return [1 for _ in range(n_tokens)] + [0 for _ in range(n_padding)]


def get_segments(tokens, max_seq_length):
    """Build the segment ids for one example.

    Every token up to and including the first "[SEP]" gets id 0; every token
    after it gets id 1. The list is then padded with 0 up to
    ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    separator_seen = False
    for tok in tokens:
        segment_ids.append(1 if separator_seen else 0)
        # The separator itself still belongs to the segment it closes.
        separator_seen = separator_seen or tok == "[SEP]"
    segment_ids.extend([0] * n_padding)
    return segment_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and pad the result with 0.

    Args:
        tokens: list of token strings for one example.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` ids: the tokenizer's ids followed by 0s.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``. This
            matches ``get_masks`` / ``get_segments``; previously an over-long
            input was returned unpadded and un-truncated, which would yield
            rows of unequal length.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_length - len(token_ids))

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and collects only the text data."""

    def __init__(self):
        # Run the base-class constructor so every attribute HTMLParser relies
        # on is initialised (calling only reset() skips part of that setup).
        super().__init__(convert_charrefs=True)
        self.fed = []  # text chunks in document order

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: string that may contain HTML markup.

    Returns:
        The text content of ``html`` as one string.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still holding back, e.g. a trailing
    # "AT&T" with no tag after it; without it that text is silently lost.
    s.close()
    return s.get_data()

def tester(x,df,y=256,p=False,train=True):
    """Turn the first ``x`` rows of ``df`` into BERT input tensors.

    For each row the HTML-stripped question and candidate are joined as
    ``"[CLS] <question>[CLS] <candidate>"``, tokenized with the module-level
    ``tokenizer`` and truncated to ``y`` tokens.

    Args:
        x: number of leading rows of ``df`` to convert.
        df: DataFrame with 'questions' and 'candidate' columns (and 'target'
            when ``train`` is True).
        y: maximum sequence length; shorter examples are padded.
        p: when True, print each assembled text before tokenizing.
        train: when True also return the one-hot targets.

    Returns:
        ``(inputs, output)`` when ``train`` is True, otherwise ``inputs``.
        ``inputs`` is ``[ids, masks, segments]``; ``output`` is the one-hot
        (depth 2) encoding of the 'target' column.
    """
    rows = df.head(x)
    all_ids, all_masks, all_segments = [], [], []

    for idx in rows.index:
        candidate = strip_tags(str(df.loc[idx,'candidate'])).replace(".","[SEP]")
        for quote in ('""', '``', "''"):
            candidate = candidate.replace(quote, '')
        candidate = " ".join(candidate.split())  # collapse runs of whitespace

        question = strip_tags(str(df.loc[idx,'questions'])).replace(".","[SEP]")
        text = "[CLS] " + question + "[CLS]" + " " + candidate
        if p:
            print(text)

        tokens = tokenizer.tokenize(text)[:y]
        all_ids.append(get_ids(tokens,tokenizer,y))
        all_masks.append(get_masks(tokens,y))
        all_segments.append(get_segments(tokens,y))

    # NOTE(review): mk_model declares its Input layers as tf.int32 while these
    # tensors are built as float - confirm the implicit cast is intended.
    inputs = [tf.convert_to_tensor(feature,dtype=float)
              for feature in (all_ids, all_masks, all_segments)]
    if not train:
        return inputs

    output = tf.one_hot(tf.convert_to_tensor(rows['target'],dtype=tf.int32),depth=2)
    return inputs,output

In [6]:
inp,target = tester(1,df,p=True) #Example of the text being converted to input tensors

[CLS] what was the number one song in june 1994[CLS] Issue date Song Artist ( s ) Reference January 1 Hero Mariah Carey January 8 January 15 January 22 All for Love Bryan Adams / Rod Stewart / Sting January 29 February 5 February 12 The Power of Love Céline Dion February 19 February 26 March 5 March 12 The Sign Ace of Base March 19 March 26 April 2 April 9 Bump n ' Grind R[SEP] Kelly April 16 April 23 April 30 May 7 The Sign Ace of Base May 14 May 21 I Swear All - 4 - One May 28 June 4 June 11 June 18 June 25 July 2 July 9 July 16 July 23 July 30 August 6 Stay ( I Missed You ) Lisa Loeb & Nine Stories August 13 August 20 August 27 I 'll Make Love to You Boyz II Men September 3 September 10 September 17 September 24 October 1 October 8 October 15 October 22 October 29 November 5 November 12 November 19 November 26 December 3 On Bended Knee December 10 December 17 Here Comes the Hotstepper Ini Kamoze December 24 December 31 On Bended Knee Boyz II Men


The above is an example of the text before being converted to the input formats. It begins with the [CLS] token, followed by the question. After the question comes another [CLS] token and then the candidate answer.

In [7]:
inp #The final input for the model

[<tf.Tensor: shape=(1, 256), dtype=float32, numpy=
 array([[  101.,  2054.,  2001.,  1996.,  2193.,  2028.,  2299.,  1999.,
          2238.,  2807.,   101.,  3277.,  3058.,  2299.,  3063.,  1006.,
          1055.,  1007.,  4431.,  2254.,  1015.,  5394.,  3814.,  2232.,
         11782.,  2254.,  1022.,  2254.,  2321.,  2254.,  2570.,  2035.,
          2005.,  2293.,  8527.,  5922.,  1013.,  8473.,  5954.,  1013.,
         12072.,  2254.,  2756.,  2337.,  1019.,  2337.,  2260.,  1996.,
          2373.,  1997.,  2293., 24550., 19542.,  2337.,  2539.,  2337.,
          2656.,  2233.,  1019.,  2233.,  2260.,  1996.,  3696.,  9078.,
          1997.,  2918.,  2233.,  2539.,  2233.,  2656.,  2258.,  1016.,
          2258.,  1023., 16906.,  1050.,  1005., 23088.,  1054.,   102.,
          5163.,  2258.,  2385.,  2258.,  2603.,  2258.,  2382.,  2089.,
          1021.,  1996.,  3696.,  9078.,  1997.,  2918.,  2089.,  2403.,
          2089.,  2538.,  1045.,  8415.,  2035.,  1011.,  1018.,  1011.,


In [8]:
target

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0., 1.]], dtype=float32)>

The target is a one hot encoded tensor. If the candidate is correct, it will be [0,1], otherwise it will be [1,0]. This format will allow us to use the Binary Crossentropy from logits as the loss function for our model

##### Now let's create the model. The BERT layer has been downloaded and saved in the assets folder. The BERT layer was downloaded from here: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1

In [10]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True,name="bert")

In [None]:
def mk_model(max_seq_length=256):
    """Build the long-answer classifier on top of the shared ``bert_layer``.

    Args:
        max_seq_length: length of each of the three input sequences. Defaults
            to 256, the value that was previously hard-coded.

    Returns:
        A Keras ``Model`` taking ``[input_ids, input_mask, segment_ids]`` and
        producing a 2-way softmax (probabilities, not logits).
    """
    input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="input_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                        name="segment_ids")
    # Only the pooled output feeds the classification head; the per-token
    # sequence output is unused.
    pooled_output, _sequence_output = bert_layer([input_ids, input_mask, segment_ids])

    x = tf.keras.layers.Dense(128,activation=tf.nn.relu,name='combined_layer',
                              kernel_initializer=tf.random_normal_initializer())(pooled_output)
    # Softmax is applied here, so any loss attached to this model must be
    # configured for probabilities rather than raw logits.
    probabilities = tf.keras.layers.Dense(2,activation='softmax')(x)
    model = Model(inputs=[input_ids,input_mask,segment_ids], outputs=probabilities)
    return model

In [None]:
model = mk_model()

In [None]:
model.summary()

The model has 109,580,930 trainable params. The output shape of the BERT layer is (None, 768); this feeds into a 128-neuron layer before the 2-neuron output layer. The model will be compiled using a Stochastic Gradient Descent optimizer and the loss will be Binary Cross Entropy. <br><br>

Binary crossentropy measures how far away from the true value (which is either 0 or 1) the prediction is for each of the classes and then averages these class-wise errors to obtain the final loss. <br><br>

Metrics used were the plain accuracy metric and the F1 score.

The F1 score considers both the precision p and the recall r of the test to compute the score: p is the number of correct positive results divided by the number of all positive results returned by the classifier, and r is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive). The F1 score is the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. (From Wikipedia)

In [None]:
# tensorflow_addons is already imported as `tfa` in the notebook's first cell.
f1 = tfa.metrics.F1Score(num_classes=2)
sgd = tf.keras.optimizers.SGD(learning_rate=0.01,momentum=0.1)
# mk_model's output layer already applies softmax, so the loss receives
# probabilities. from_logits=True would squash them a second time.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer=sgd,
              loss=loss,
              metrics=['accuracy',f1])

This model was trained on a virtual instance with 1 GPU on Google Cloud Platform. Due to the large number of trainable params, the training could be done with a maximum batch size of 16 with only 1 GPU. 

### Short Answer Model

We will need a different function to extract data for the short answer model. We will no longer be needing long answer candidate rows. We will need each question on 1 row with its correct long and short answer (if any).

In [None]:
def get_data(t):
    """Flatten one parsed example into a single-row DataFrame.

    Only the first annotation, and only its first short answer, are used.
    Token offsets index into ``t['document_text']`` split on single spaces.

    Args:
        t: dict with 'example_id', 'question_text', 'document_text' and
            'annotations' keys.

    Returns:
        One-row DataFrame with columns ids, question, answer, short_answer,
        has_yn, start, end. short_answer/start/end are None when the
        annotation has no short answers.
    """
    ex_id = t['example_id']
    question = t['question_text']
    annotation = t['annotations'][0]
    yes_no = annotation['yes_no_answer']

    long_span = annotation['long_answer']
    doc_tokens = t['document_text'].split(" ")
    long_text = " ".join(doc_tokens[long_span['start_token']:long_span['end_token']])

    short_answers = annotation['short_answers']
    if short_answers:
        first_short = short_answers[0]
        start = first_short['start_token']
        end = first_short['end_token']
        short_text = " ".join(doc_tokens[start:end])
    else:
        short_text = start = end = None

    return pd.DataFrame({
        "ids": [ex_id],
        "question": [question],
        "answer": [long_text],
        "short_answer": [short_text],
        "has_yn": [yes_no],
        'start': [start],
        'end': [end],
    })

In [None]:
def ext_json_short(path,x):
    """Read up to ``x`` examples from a JSON-lines file into one DataFrame.

    Each line is parsed with ``json.loads`` and flattened by ``get_data``.

    Args:
        path: path to the ``.jsonl`` file.
        x: number of lines to read. If the file holds fewer lines, every line
            is returned (previously this case raised UnboundLocalError because
            the frame was only built when exactly ``x`` lines had been read).

    Returns:
        DataFrame with columns ids, questions, answer, short_answer, has_yn,
        start, end - one row per line read.
    """
    # column name produced by get_data -> column name in the returned frame
    column_map = {
        'ids': 'ids',
        'question': 'questions',
        'answer': 'answer',
        'short_answer': 'short_answer',
        'has_yn': 'has_yn',
        'start': 'start',
        'end': 'end',
    }
    collected = {out_name: [] for out_name in column_map.values()}

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'rt', encoding='utf-8') as json_file:
        for cnt, line in enumerate(json_file, start=1):
            results = get_data(json.loads(line))
            for src_name, out_name in column_map.items():
                collected[out_name].extend(results[src_name])
            if cnt == x:
                break

    return pd.DataFrame(collected)
    



In [180]:
df = ext_json_short("assets/simplified-nq-train.jsonl",10)

In [None]:
df.head()

In [None]:
def yne(row): # Function to merge "has_yn" column into "short_answer column".
    """Return the short answer for ``row``, folding in the yes/no annotation.

    Args:
        row: mapping (e.g. a DataFrame row) with 'has_yn' and 'short_answer'.

    Returns:
        "Yes" or "No" when ``row['has_yn']`` is "YES" or "NO"; otherwise
        ``row['short_answer']``. Any other ``has_yn`` value (not only "NONE")
        falls back to the short answer; previously such a value raised
        UnboundLocalError.
    """
    flag = row['has_yn']
    if flag == "YES":
        return "Yes"
    if flag == "NO":
        return "No"
    return row['short_answer']

df['short_answer'] = df.apply(yne,axis=1)

We will need to create short answer candidates from the correct long answer. This will be done by breaking the long answer up into sentences and also by isolating named entities and dates. Each question will also have a Yes & No as candidates

In [183]:
import nltk
cols = ['id',"question",'candidate','long_answer','target']

def preprocess(sent):
    """Tokenize ``sent`` with NLTK and return the part-of-speech tagged tokens."""
    return nltk.pos_tag(nltk.word_tokenize(sent))
def created_sa_cands(df,x=False,y=False):
    """Expand each question into short-answer candidate rows.

    For every row of ``df`` the following candidates are emitted, in order:
      1. the annotated short answer (target 1),
      2. every sentence of the HTML-stripped long answer (target 0),
      3. every distinct run of proper-noun / number tokens in the long
         answer, optionally joined by a conjunction (target 0),
      4. the literals "Yes" and "No" (target 0).

    Args:
        df: DataFrame with 'ids', 'questions', 'answer' and 'short_answer'
            columns. It is not modified.
        x: first index label to include, or False for "from the start".
        y: last index label to include (inclusive, as with ``.loc``), or
            False for "to the end".

    Returns:
        DataFrame with the columns listed in the module-level ``cols``.
    """
    # False is the "not given" sentinel. Compare by identity so that a start
    # label of 0 still slices (a plain truthiness test skipped it).
    if x is not False or y is not False:
        start = None if x is False else x
        stop = None if y is False else y
        df = df.loc[start:stop]

    entity_tags = ("NNP", "CD")
    rows = []

    for i in tqdm(df.index):
        qid = df.loc[i,'ids']
        q = df.loc[i,'questions']
        # Keep the stripped text local instead of writing it back into the
        # caller's frame.
        la = strip_tags(str(df.loc[i,'answer']))
        sa = df.loc[i,'short_answer']

        # NOTE(review): the short answer is emitted with target 1 even when it
        # is missing (None) - confirm that is intended for questions without
        # a short answer.
        candidates = [(sa, 1)]
        candidates.extend((sent, 0) for sent in split_into_sentences(la))

        seen = [sa]        # entity candidates must not repeat the answer
        text = ""
        last_tag = None    # reset per row; no tag precedes the first token
        # The trailing sentinel has a tag that matches nothing, which forces
        # a final flush so an entity run ending the answer is not lost.
        for word, tag in list(preprocess(la)) + [("", None)]:
            if tag in entity_tags or (tag == "CC" and last_tag in entity_tags):
                text += word + " "
            else:
                entity = text.strip()
                if entity and entity not in seen:
                    candidates.append((entity, 0))
                    seen.append(entity)
                # Always start a fresh run, even when the finished one was a
                # duplicate; otherwise it would be glued onto the next run.
                text = ""
            last_tag = tag

        candidates.append(("Yes", 0))
        candidates.append(("No", 0))

        rows.extend(
            {'id': qid, 'question': q, 'candidate': cand,
             'long_answer': la, 'target': target}
            for cand, target in candidates
        )

    return pd.DataFrame(rows, columns=cols)

import re
# Regex fragments used by split_into_sentences. Raw strings keep "\s" a regex
# escape rather than an (invalid) Python string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into sentences with rule-based handling of abbreviations.

    Periods that belong to titles, initials, acronyms, company suffixes and
    web domains are temporarily replaced with ``<prd>`` so that only genuine
    sentence terminators ('.', '?', '!') become ``<stop>`` split points.

    Args:
        text: the string to split.

    Returns:
        List of stripped, non-empty sentences. A final fragment without
        terminal punctuation is kept (it used to be dropped).
    """
    text = " " + text + "  "
    text = text.replace("\n"," ")
    # Protect periods that do not end a sentence.
    text = re.sub(prefixes,r"\1<prd>",text)
    text = re.sub(websites,r"<prd>\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ",r" \1<prd> ",text)
    text = re.sub(acronyms+" "+starters,r"\1<stop> \2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",r"\1<prd>\2<prd>\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]",r"\1<prd>\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters,r" \1<stop> \2",text)
    text = re.sub(" "+suffixes+"[.]",r" \1<prd>",text)
    text = re.sub(" " + alphabets + "[.]",r" \1<prd>",text)
    # Move terminators outside closing quotes so the quote stays attached.
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # Drop only the empty remainder after the last terminator; unconditionally
    # discarding the last piece lost any text lacking final punctuation.
    return [s for s in sentences if s]

In [185]:
sa_df = created_sa_cands(df,0,20)

100%|██████████| 10/10 [00:00<00:00, 57.51it/s]


In [197]:
temp = sa_df.loc[sa_df['id'] == 5328212470870865242]
temp.reset_index(inplace=True,drop=True)
print("Long answer: {}\n".format(temp.loc[0,'long_answer'].replace("  ", " ")))
for i in temp.index:
    print("Candidate {} : {}".format(i,temp.loc[i,'candidate']))

Long answer:  Tracy McConnell , better known as `` The Mother '' , is the title character from the CBS television sitcom How I Met Your Mother . The show , narrated by Future Ted , tells the story of how Ted Mosby met The Mother . Tracy McConnell appears in 8 episodes from `` Lucky Penny '' to `` The Time Travelers '' as an unseen character ; she was first seen fully in `` Something New '' and was promoted to a main character in season 9 . The Mother is played by Cristin Milioti . 

Candidate 0 : Tracy McConnell
Candidate 1 : Tracy McConnell , better known as `` The Mother '' , is the title character from the CBS television sitcom How I Met Your Mother .
Candidate 2 : The show , narrated by Future Ted , tells the story of how Ted Mosby met The Mother .
Candidate 3 : Tracy McConnell appears in 8 episodes from `` Lucky Penny '' to `` The Time Travelers '' as an unseen character ; she was first seen fully in `` Something New '' and was promoted to a main character in season 9 .
Candidate 

Next we will need functions to prep the data to be input to the model. These are similar to the long answer functions but with slight modifications.

In [190]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for one example.

    Returns a list of length ``max_seq_length`` holding 1 for every entry of
    ``tokens`` followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    return [1 for _ in range(n_tokens)] + [0 for _ in range(n_padding)]


def get_segments(tokens, max_seq_length):
    """Build the segment ids for one example.

    Every token up to and including the first "[SEP]" gets id 0; every token
    after it gets id 1. The list is then padded with 0 up to
    ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    separator_seen = False
    for tok in tokens:
        segment_ids.append(1 if separator_seen else 0)
        # The separator itself still belongs to the segment it closes.
        separator_seen = separator_seen or tok == "[SEP]"
    segment_ids.extend([0] * n_padding)
    return segment_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and pad the result with 0.

    Args:
        tokens: list of token strings for one example.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` ids: the tokenizer's ids followed by 0s.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``. This
            matches ``get_masks`` / ``get_segments``; previously an over-long
            input was returned unpadded and un-truncated, which would yield
            rows of unequal length.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_length - len(token_ids))

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and collects only the text data."""

    def __init__(self):
        # Run the base-class constructor so every attribute HTMLParser relies
        # on is initialised (calling only reset() skips part of that setup).
        super().__init__(convert_charrefs=True)
        self.fed = []  # text chunks in document order

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: string that may contain HTML markup.

    Returns:
        The text content of ``html`` as one string.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still holding back, e.g. a trailing
    # "AT&T" with no tag after it; without it that text is silently lost.
    s.close()
    return s.get_data()

def tester(x,df,y=256):
    """Turn the first ``x`` short-answer candidate rows into BERT input tensors.

    For each row the text is assembled as
    ``"[CLS] <question>[CLS] <candidate>[CLS]<long answer>"``, tokenized with
    the module-level ``tokenizer`` and truncated to ``y`` tokens.

    Args:
        x: number of leading rows to convert.
        df: DataFrame with 'question', 'candidate', 'long_answer' and
            'target' columns. It is not modified.
        y: maximum sequence length; shorter examples are padded.

    Returns:
        ``(inputs, output)`` where ``inputs`` is ``[ids, masks, segments]``
        and ``output`` is the one-hot (depth 2) encoding of 'target'.
    """
    # Work on a re-indexed copy: resetting the index in place would silently
    # change the caller's DataFrame.
    df = df.reset_index(drop=True)
    rows = df.head(x)

    input_ids = []
    input_masks = []
    input_segments = []
    for i in tqdm(list(rows.index)):
        ans = strip_tags(str(df.loc[i,'long_answer']))
        ans = ans.replace(".","[SEP]")
        ans = ans.replace('""', '')
        ans = ans.replace('``','')
        ans = ans.replace("''", "")
        ans = " ".join(ans.split())  # collapse runs of whitespace

        qn = strip_tags(str(df.loc[i,'question']))
        qn = qn.replace(".","[SEP]")
        qn = "[CLS] " + qn + "[CLS]"
        cn = str(df.loc[i,'candidate'])
        fn = qn + " " + cn +"[CLS]" + ans

        tokens = tokenizer.tokenize(fn)[:y]
        input_ids.append(get_ids(tokens,tokenizer,y))
        input_masks.append(get_masks(tokens,y))
        input_segments.append(get_segments(tokens,y))

    inputs = [tf.convert_to_tensor(input_ids,dtype=float),
              tf.convert_to_tensor(input_masks,dtype=float),
              tf.convert_to_tensor(input_segments,dtype=float)]

    output = tf.one_hot(tf.convert_to_tensor(rows['target'],dtype=tf.int32),depth=2)

    return inputs,output

In [191]:
inp,out = tester(20,sa_df)

100%|██████████| 20/20 [00:00<00:00, 440.62it/s]


In [170]:
def mk_sa_model(max_seq_length=256):
    """Build the short-answer classifier with its own BERT layer.

    Args:
        max_seq_length: length of each of the three input sequences. Defaults
            to 256, the value that was previously hard-coded.

    Returns:
        A Keras ``Model`` taking ``[input_ids, input_mask, segment_ids]`` and
        producing a 2-way softmax (probabilities, not logits).
    """
    input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="input_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                        name="segment_ids")
    # A fresh hub layer is created per call rather than reusing a shared one.
    bert_layer1 = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True,name="bert")

    # Only the pooled output feeds the classification head; the per-token
    # sequence output is unused.
    pooled_output, _sequence_output = bert_layer1([input_ids, input_mask, segment_ids])

    x = tf.keras.layers.Dense(128,activation=tf.nn.relu,name='combined_layer_2',
                              kernel_initializer=tf.random_normal_initializer())(pooled_output)

    # Softmax is applied here, so any loss attached to this model must be
    # configured for probabilities rather than raw logits.
    probabilities = tf.keras.layers.Dense(2,activation='softmax')(x)
    model = Model(inputs=[input_ids,input_mask,segment_ids], outputs=probabilities)

    return model

In [186]:
sa_model = mk_sa_model()

In [171]:
sa_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert (KerasLayer)               [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_mask[0][0]           

Both models were trained on Google Cloud platform, refer to the scripts "la_mod.py" & "sa_mod.py" for code which ran the training in the cloud

### Model evaluation

Long answer f1 score: 0.91 
Short answer f1 score: 0.89

Long answer model is predicting well but short answer is not. Will need to rethink the modeling approach for the short answer model

The "Submission" notebook will build the final model and prepare the submission for kaggle