# Set Environment

## import package

In [177]:
!pip install pythainlp



In [178]:
import pandas as pd
import numpy as np
import torchtext
import torch
import time
from torch import nn
import json, re, unicodedata, string, typing, time
import torch.nn.functional as F
import spacy
from collections import Counter
import pickle
from pythainlp.tokenize import word_tokenize
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## mount google drive

In [179]:
#mount my google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [180]:
%cd /content/drive/MyDrive/Colab Notebooks/BADS9000_IS/Colab-DrQA

/content/drive/MyDrive/Colab Notebooks/BADS9000_IS/Colab-DrQA


# Load Data

In [181]:
def load_pickle(path_file):
    with open(path_file, 'rb') as file:
        load_obj = pickle.load(file)
        print(f"load object from {path_file} success,that is {type(load_obj)}")
        return load_obj

In [183]:
%%time
file = "ThaiQACorpus"
df_qa = load_pickle(f"./drqa/1-tokenizers/result/df_{file}_prepairing.pkl")
word2idx = load_pickle("./drqa/1-tokenizers/result/dict_word2idx.pkl")
idx2word  = load_pickle("./drqa/1-tokenizers/result/dict_idx2word.pkl")
word_vocab = load_pickle("./drqa/1-tokenizers/result/list_word_vocab.pkl")
glove_dict =  load_pickle("./drqa/1-tokenizers/result/dict_embed_ltw2v.pkl")
weights_matrix = np.load('./drqa/1-tokenizers/result/dfqa2v_ltw2v.npy')

load object from ./drqa/1-tokenizers/result/df_ThaiQACorpus_prepairing.pkl success,that is <class 'pandas.core.frame.DataFrame'>
load object from ./drqa/1-tokenizers/result/dict_word2idx.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/dict_idx2word.pkl success,that is <class 'dict'>
load object from ./drqa/1-tokenizers/result/list_word_vocab.pkl success,that is <class 'list'>
load object from ./drqa/1-tokenizers/result/dict_embed_ltw2v.pkl success,that is <class 'dict'>
CPU times: user 2.81 s, sys: 1.52 s, total: 4.33 s
Wall time: 4.37 s


In [184]:
print(f"number of element in dict {len(glove_dict):0,.0f}")
print(f"shape of element in dict {glove_dict['that'].shape}")
print(f"type of element in dict {type(glove_dict['that'])}")
print(f"number of word not found in dict : {np.sum(weights_matrix.sum(axis=1)==0):0,.0f}")

number of element in dict 731,185
shape of element in dict (300,)
type of element in dict <class 'numpy.ndarray'>
number of word not found in dict : 24,627


In [185]:
from sklearn.model_selection import train_test_split
df_qa = df_qa[df_qa.context_ids.apply(lambda x:len(x))<850][['id', 'context', 'question', 'label', 'answer', 'context_ids', 'question_ids', 'label_idx']].reset_index(drop=True)
train_df, valid_df = train_test_split(df_qa, test_size=0.1 , random_state=12345)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
print(f"type(train_df):{type(train_df)}, train_df.shape:{train_df.shape}, #columns:{len(train_df.columns)}")
print(f"type(valid_df):{type(valid_df)}, valid_df.shape:{valid_df.shape}, #columns:{len(valid_df.columns)}")

type(train_df):<class 'pandas.core.frame.DataFrame'>, train_df.shape:(9306, 8), #columns:8
type(valid_df):<class 'pandas.core.frame.DataFrame'>, valid_df.shape:(1034, 8), #columns:8


# Create Torch Batch

In [186]:
class SquadDataset:
    '''
    -Divides the dataframe in batches.
    -Pads the contexts and questions dynamically for each batch by padding 
     the examples to the maximum-length sequence in that batch.
    -Calculates masks for context and question.
    -Calculates spans for contexts.
    '''
    
    def __init__(self, data, batch_size):
        
        self.batch_size = batch_size
        data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
        self.data = data
    
    def get_span(self, text):
        
        lst_token = word_tokenize(text ,engine='newmm')
        span  = [(len("".join(lst_token[0:i])), len("".join(lst_token[0:i+1]))) 
                         for i,w in enumerate(lst_token)]
        return span

    def __len__(self):
        return len(self.data)
    
    def __iter__(self):
        '''
        Creates batches of data and yields them.
        
        Each yield comprises of:
        :padded_context: padded tensor of contexts for each batch 
        :padded_question: padded tensor of questions for each batch 
        :context_mask & question_mask: zero-mask for question and context
        :label: start and end index wrt context_ids
        :context_text,answer_text: used while validation to calculate metrics
        :context_spans: spans of context text
        :ids: question_ids used in evaluation
        '''
        
        for batch in self.data:
                            
            spans = []
            context_text = []
            answer_text = []
            
            max_context_len = max([len(ctx) for ctx in batch.context_ids])
            padded_context = torch.LongTensor(len(batch), max_context_len).fill_(1)
            
            for ctx in batch.context:
                context_text.append(ctx)
                spans.append(self.get_span(ctx))
            
            for ans in batch.answer:
                answer_text.append(ans)
                
            for i, ctx in enumerate(batch.context_ids):
                padded_context[i, :len(ctx)] = torch.LongTensor(ctx)
            
            max_question_len = max([len(ques) for ques in batch.question_ids])
            padded_question = torch.LongTensor(len(batch), max_question_len).fill_(1)
            
            for i, ques in enumerate(batch.question_ids):
                padded_question[i,: len(ques)] = torch.LongTensor(ques)
                
            
            label = torch.LongTensor(list(batch.label_idx))
            context_mask = torch.eq(padded_context, 1)
            question_mask = torch.eq(padded_question, 1)
            
            ids = list(batch.id)  
            
            yield (padded_context, padded_question, context_mask, 
                   question_mask, label, context_text, answer_text, ids)

In [187]:
#if maximum number of token is too big, it use more GPU
# context_ids , question_ids are to big, it have size 200K when in english have size 810
print(f"maximum number of token in question_ids : {train_df.context_ids.apply(lambda x:len(x)).max()}")
print(f"maximum number of token in question_ids : {train_df.question_ids.apply(lambda x:len(x)).max()}")

maximum number of token in question_ids : 849
maximum number of token in question_ids : 57


In [188]:
%time
train_dataset = SquadDataset(train_df, 32)
valid_dataset = SquadDataset(valid_df, 32)
print(f"type(train_dataset):{type(train_dataset)}")
print(f"type(valid_dataset):{type(valid_dataset)}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
type(train_dataset):<class '__main__.SquadDataset'>
type(valid_dataset):<class '__main__.SquadDataset'>


In [189]:
train_df.head(1)

Unnamed: 0,id,context,question,label,answer,context_ids,question_ids,label_idx
0,7983,บ้านซอยสวนพลู บ้านซอยสวนพลู เป็นชื่อที่ใช้เรีย...,ม.ร.ว.คึกฤทธิ์ ปราโมช เข้าอยู่อาศัยในบ้านซอยสว...,"[434, 443]",พ.ศ. 2503,"[139, 1612, 556, 5721, 2, 139, 1612, 556, 5721...","[4945, 6018, 2, 4356, 2, 112, 6809, 3, 139, 16...","[128, 130]"


In [190]:
padded_context, padded_question, context_mask, question_mask, label, context_text, answer_text, ids = next(iter(train_dataset))

In [191]:
padded_context[0][~context_mask[0]] 

tensor([  139,  1612,   556,  5721,     2,   139,  1612,   556,  5721,     2,
            7,    65,     5,    35,  7300,  3741,     8,  4945,  6018,     2,
         4356,     2,   187,     3,  2670,     2,   598,     2,  1612,    28,
         6245,     2,    23,     7,  1612,  1045,     8,  1612,   556,  5721,
            2,    10,  1612,  6771,     2,    46,    11,     2,   210,  6771,
          280,     2,   147,  6771,     2,   139,  1612,   556,  5721,   287,
          132,   120,     2,    83,     2,   837,     2,     5,  4945,  6018,
         1071,   125,   145,     2,    15,     2,  4741,     2,   409,  2061,
           36,   679,   168,     2,    83,     2,   198,     2,  2061,   161,
          198,   277,  1071,   160,  1685,  1474,  7773,     2,    27,     2,
           15,     2,  3937,     2,     6,  2061,    36,   239,   277,    90,
            2,    43,     2,   198,  1071,   160,     2,    68, 14678,     2,
         2975,     2,    13,   255,  2061,   915,    27,     2, 

In [192]:
padded_question[0][~question_mask[0]]

tensor([4945, 6018,    2, 4356,    2,  112, 6809,    3,  139, 1612,  556, 5721,
          27,   18,  104])

# Create Layer function

In [201]:
class AlignQuestionEmbedding(nn.Module):
    
    def __init__(self, input_dim):        
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, input_dim)
        
        self.relu = nn.ReLU()
        
    def forward(self, context, question, question_mask):
        
        # context = [bs, ctx_len, emb_dim]
        # question = [bs, qtn_len, emb_dim]
        # question_mask = [bs, qtn_len]
    
        ctx_ = self.linear(context)
        ctx_ = self.relu(ctx_)
        # ctx_ = [bs, ctx_len, emb_dim]
        
        qtn_ = self.linear(question)
        qtn_ = self.relu(qtn_)
        # qtn_ = [bs, qtn_len, emb_dim]
        
        qtn_transpose = qtn_.permute(0,2,1)
        # qtn_transpose = [bs, emb_dim, qtn_len]
        
        align_scores = torch.bmm(ctx_, qtn_transpose)
        # align_scores = [bs, ctx_len, qtn_len]
        
        qtn_mask = question_mask.unsqueeze(1).expand(align_scores.size())
        # qtn_mask = [bs, 1, qtn_len] => [bs, ctx_len, qtn_len]
        
        # Fills elements of self tensor(align_scores) with value(-float(inf)) where mask is True. 
        # The shape of mask must be broadcastable with the shape of the underlying tensor.
        align_scores = align_scores.masked_fill(qtn_mask == 1, -float('inf'))
        # align_scores = [bs, ctx_len, qtn_len]
        
        align_scores_flat = align_scores.view(-1, question.size(1))
        # align_scores = [bs*ctx_len, qtn_len]
        
        alpha = F.softmax(align_scores_flat, dim=1)
        alpha = alpha.view(-1, context.shape[1], question.shape[1])
        # alpha = [bs, ctx_len, qtn_len]
        
        align_embedding = torch.bmm(alpha, question)
        # align = [bs, ctx_len, emb_dim]
        
        return align_embedding

In [202]:
class StackedBiLSTM(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        
        super().__init__()
        
        self.dropout = dropout
        
        self.num_layers = num_layers
        
        self.lstms = nn.ModuleList()
        
        for i in range(self.num_layers):
            
            input_dim = input_dim if i == 0 else hidden_dim * 2
            
            self.lstms.append(nn.LSTM(input_dim, hidden_dim,
                                      batch_first=True, bidirectional=True))
           
    
    def forward(self, x):
        # x = [bs, seq_len, feature_dim]

        outputs = [x]
        for i in range(self.num_layers):

            lstm_input = outputs[-1]
            lstm_out = F.dropout(lstm_input, p=self.dropout)
            lstm_out, (hidden, cell) = self.lstms[i](lstm_input)
           
            outputs.append(lstm_out)

    
        output = torch.cat(outputs[1:], dim=2)
        # [bs, seq_len, num_layers*num_dir*hidden_dim]
        
        output = F.dropout(output, p=self.dropout)
      
        return output

In [203]:
class LinearAttentionLayer(nn.Module):
    
    def __init__(self, input_dim):
        
        super().__init__()
        
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, question, question_mask):
        
        # question = [bs, qtn_len, input_dim] = [bs, qtn_len, bi_lstm_hid_dim]
        # question_mask = [bs,  qtn_len]
        
        qtn = question.view(-1, question.shape[-1])
        # qtn = [bs*qtn_len, hid_dim]
        
        attn_scores = self.linear(qtn)
        # attn_scores = [bs*qtn_len, 1]
        
        attn_scores = attn_scores.view(question.shape[0], question.shape[1])
        # attn_scores = [bs, qtn_len]
        
        attn_scores = attn_scores.masked_fill(question_mask == 1, -float('inf'))
        
        alpha = F.softmax(attn_scores, dim=1)
        # alpha = [bs, qtn_len]
        
        return alpha      

In [204]:
def weighted_average(x, weights):
    # x = [bs, len, dim]
    # weights = [bs, len]
    
    weights = weights.unsqueeze(1)
    # weights = [bs, 1, len]
    
    w = weights.bmm(x).squeeze(1)
    # w = [bs, 1, dim] => [bs, dim]
    
    return w

In [205]:
class BilinearAttentionLayer(nn.Module):
    
    def __init__(self, context_dim, question_dim):
        
        super().__init__()
        
        self.linear = nn.Linear(question_dim, context_dim)
        
    def forward(self, context, question, context_mask):
        
        # context = [bs, ctx_len, ctx_hid_dim] = [bs, ctx_len, hid_dim*6] = [bs, ctx_len, 768]
        # question = [bs, qtn_hid_dim] = [bs, qtn_len, 768]
        # context_mask = [bs, ctx_len]
        
        qtn_proj = self.linear(question)
        # qtn_proj = [bs, ctx_hid_dim]
        
        qtn_proj = qtn_proj.unsqueeze(2)
        # qtn_proj = [bs, ctx_hid_dim, 1]
        
        scores = context.bmm(qtn_proj)
        # scores = [bs, ctx_len, 1]
        
        scores = scores.squeeze(2)
        # scores = [bs, ctx_len]
        
        scores = scores.masked_fill(context_mask == 1, -float('inf'))
        
        #alpha = F.log_softmax(scores, dim=1)
        # alpha = [bs, ctx_len]
        
        return scores

# Create Model

## Define Sturcture

In [206]:
class DocumentReader(nn.Module):
    
    def __init__(self, hidden_dim, embedding_dim, num_layers, num_directions, dropout, device):
        
        super().__init__()
        
        self.device = device
        
        #self.embedding = self.get_glove_embedding()
        
        self.context_bilstm = StackedBiLSTM(embedding_dim * 2, hidden_dim, num_layers, dropout)
        
        self.question_bilstm = StackedBiLSTM(embedding_dim, hidden_dim, num_layers, dropout)
        
        self.glove_embedding = self.get_glove_embedding()
        
        def tune_embedding(grad, words=61036):
            grad[words:] = 0
            return grad
        
        #self.glove_embedding.weight.register_hook(tune_embedding)
        
        self.align_embedding = AlignQuestionEmbedding(embedding_dim)
        
        self.linear_attn_question = LinearAttentionLayer(hidden_dim*num_layers*num_directions) 
        
        self.bilinear_attn_start = BilinearAttentionLayer(hidden_dim*num_layers*num_directions, 
                                                          hidden_dim*num_layers*num_directions)
        
        self.bilinear_attn_end = BilinearAttentionLayer(hidden_dim*num_layers*num_directions,
                                                        hidden_dim*num_layers*num_directions)
        
        self.dropout = nn.Dropout(dropout)
   
        
    def get_glove_embedding(self):
        
        weights_matrix = np.load('./drqa/1-tokenizers/result/dfqa2v_ltw2v.npy')
        num_embeddings, embedding_dim = weights_matrix.shape
        embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix).to(self.device),freeze=False ,padding_idx=0)

        return embedding
    
    
    def forward(self, context, question, context_mask, question_mask):
       
        # context = [bs, len_c]
        # question = [bs, len_q]
        # context_mask = [bs, len_c]
        # question_mask = [bs, len_q]
        
        
        ctx_embed = self.glove_embedding(context)
        # ctx_embed = [bs, len_c, emb_dim]
        
        ques_embed = self.glove_embedding(question)
        # ques_embed = [bs, len_q, emb_dim]
        

        ctx_embed = self.dropout(ctx_embed)
     
        ques_embed = self.dropout(ques_embed)
             
        align_embed = self.align_embedding(ctx_embed, ques_embed, question_mask)
        # align_embed = [bs, len_c, emb_dim]  
        
        ctx_bilstm_input = torch.cat([ctx_embed, align_embed], dim=2)
        # ctx_bilstm_input = [bs, len_c, emb_dim*2]
                
        ctx_outputs = self.context_bilstm(ctx_bilstm_input)
        # ctx_outputs = [bs, len_c, hid_dim*layers*dir] = [bs, len_c, hid_dim*6]
       
        qtn_outputs = self.question_bilstm(ques_embed)
        # qtn_outputs = [bs, len_q, hid_dim*6]
    
        qtn_weights = self.linear_attn_question(qtn_outputs, question_mask)
        # qtn_weights = [bs, len_q]
            
        qtn_weighted = weighted_average(qtn_outputs, qtn_weights)
        # qtn_weighted = [bs, hid_dim*6]
        
        start_scores = self.bilinear_attn_start(ctx_outputs, qtn_weighted, context_mask)
        # start_scores = [bs, len_c]
         
        end_scores = self.bilinear_attn_end(ctx_outputs, qtn_weighted, context_mask)
        # end_scores = [bs, len_c]
        
      
        return start_scores, end_scores

## Model Setting

In [207]:
device = torch.device('cuda')
EMB_DIM = 300
HIDDEN_DIM = 128
NUM_LAYERS = 1
NUM_DIRECTIONS = 2
DROPOUT = 0.3
device = torch.device('cuda')

model = DocumentReader(HIDDEN_DIM, EMB_DIM,  NUM_LAYERS,  NUM_DIRECTIONS,  DROPOUT,  device).to(device)

In [208]:
optimizer = torch.optim.Adamax(model.parameters())

In [209]:
def count_parameters(model):
    '''Returns the number of trainable parameters in the model.'''
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 34,977,281 trainable parameters


# Create Model Training Flow

In [210]:
def train(model, train_dataset):
    '''
    Trains the model.
    '''
    
    print("Starting training ........")
    
    train_loss = 0.
    batch_count = 0
    
    # put the model in training mode
    model.train()
    
    # iterate through training data
    for batch in train_dataset:

        if batch_count % 50 == 0:
            print(f"Starting batch: {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, label, ctx, ans, ids = batch
        
        # place the tensors on GPU
        context, context_mask, question, question_mask, label = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), label.to(device)
        
        # forward pass, get the predictions
        preds = model(context, question, context_mask, question_mask)

        start_pred, end_pred = preds
        
        # separate labels for start and end position
        start_label, end_label = label[:,0], label[:,1]
        
        # calculate loss
        loss = F.cross_entropy(start_pred, start_label) + F.cross_entropy(end_pred, end_label)
        
        # backward pass, calculates the gradients
        loss.backward()
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        
        # update the gradients
        optimizer.step()
        
        # zero the gradients to prevent them from accumulating
        optimizer.zero_grad()

        train_loss += loss.item()

    return train_loss/len(train_dataset)

In [211]:
def valid(model, valid_dataset):
    '''
    Performs validation.
    '''
    
    print("Starting validation .........")
   
    valid_loss = 0.

    batch_count = 0
    
    f1, em = 0., 0.
    
    # puts the model in eval mode. Turns off dropout
    model.eval()
    
    predictions = {}
    
    for batch in valid_dataset:

        if batch_count % 500 == 0:
            print(f"Starting batch {batch_count}")
        batch_count += 1

        context, question, context_mask, question_mask, label, context_text, answers, ids = batch

        context, context_mask, question, question_mask, label = context.to(device), context_mask.to(device),\
                                    question.to(device), question_mask.to(device), label.to(device)

        with torch.no_grad():

            preds = model(context, question, context_mask, question_mask)

            p1, p2 = preds

            y1, y2 = label[:,0], label[:,1]

            loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)

            valid_loss += loss.item()

            
            # get the start and end index positions from the model preds
            
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
            
            # stack predictions
            for i in range(batch_size):
                id = ids[i]
                pred = context[i][s_idx[i]:e_idx[i]+1]
                pred = ' '.join([idx2word[idx.item()] for idx in pred])
                predictions[id] = pred
            
            
            
    em, f1 = evaluate(predictions)            
    return valid_loss/len(valid_dataset), em, f1

In [212]:
def evaluate(predictions):
    '''
    Gets a dictionary of predictions with question_id as key
    and prediction as value. The validation dataset has multiple 
    answers for a single question. Hence we compare our prediction
    with all the answers and choose the one that gives us
    the maximum metric (em or f1). 
    This method first parses the JSON file, gets all the answers
    for a given id and then passes the list of answers and the 
    predictions to calculate em, f1.
    
    
    :param dict predictions
    Returns
    : exact_match: 1 if the prediction and ground truth  match exactly, 0 otherwise.
    : f1_score: 
    '''
    f1 = exact_match = total = 0
    for ctx_id in  valid_df.id[valid_df.id.isin(predictions.keys())].unique():
      ground_truths = valid_df[valid_df.id == ctx_id]["answer"].to_list()
      prediction = predictions[ctx_id]
      exact_match += metric_max_over_ground_truths( exact_match_score, prediction, ground_truths)
      f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
      total += 1
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    
    return exact_match, f1

In [213]:
def normalize_answer(s):
    '''
    Performs a series of cleaning steps on the ground truth and 
    predicted answer.
    '''
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [214]:
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    '''
    Returns maximum value of metrics for predicition by model against
    multiple ground truths.
    
    :param func metric_fn: can be 'exact_match_score' or 'f1_score'
    :param str prediction: predicted answer span by the model
    :param list ground_truths: list of ground truths against which
                               metrics are calculated. Maximum values of 
                               metrics are chosen.
                            
    
    '''
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
        
    return max(scores_for_ground_truths)

In [215]:
def f1_score(prediction, ground_truth):
    '''
    Returns f1 score of two strings.
    '''
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [216]:
def exact_match_score(prediction, ground_truth):
    '''
    Returns exact_match_score of two strings.
    '''
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

In [217]:
def epoch_time(start_time, end_time):
    '''
    Helper function to record epoch time.
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Training Model

In [218]:
start_tm = time.time()

train_losses = []
valid_losses = []
ems = []
f1s = []
epochs = 1

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    
    start_time = time.time()
    
    train_loss = train(model, train_dataset)
    valid_loss, em, f1 = valid(model, valid_dataset)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    ems.append(em)
    f1s.append(f1)
    
    print(f"Epoch train loss : {train_loss}| Time: {epoch_mins}m {epoch_secs}s")
    print(f"Epoch valid loss: {valid_loss}")
    print(f"Epoch EM: {em}")
    print(f"Epoch F1: {f1}")
    print("====================================================================================")

print(f"Total Runingtime {time.time() - start_tm}")

Epoch 1
Starting training ........
Starting batch: 0
Starting batch: 50
Starting batch: 100
Starting batch: 150
Starting batch: 200
Starting batch: 250
Starting validation .........
Starting batch 0
Epoch train loss : nan| Time: 1m 30s
Epoch valid loss: nan
Epoch EM: 2.0309477756286265
Epoch F1: 3.3365570599613164
Total Runingtime 90.05917000770569


# Check Training

In [219]:
def loopBatch(dataset,loop_n):
  for i,batch in enumerate(dataset):
    if i >loop_n:
      break
    else :
      context, question, context_mask, question_mask, label, context_text, answers, ids  = batch
  return context, question, context_mask, question_mask, label, context_text, answers, ids

In [220]:
device = torch.device('cuda')
# place the tensors on GPU
context, question, context_mask, question_mask, label, context_text, answers, ids = loopBatch(train_dataset,3)
context, context_mask, question, question_mask, label = context.to(device), context_mask.to(device), question.to(device), question_mask.to(device), label.to(device)

In [221]:
preds = model(context, question, context_mask, question_mask)
start_pred, end_pred = preds
print(start_pred)

tensor([[nan, nan, nan,  ..., -inf, -inf, -inf],
        [nan, nan, nan,  ..., -inf, -inf, -inf],
        [nan, nan, nan,  ..., -inf, -inf, -inf],
        ...,
        [nan, nan, nan,  ..., -inf, -inf, -inf],
        [nan, nan, nan,  ..., -inf, -inf, -inf],
        [nan, nan, nan,  ..., -inf, -inf, -inf]], device='cuda:0',
       grad_fn=<MaskedFillBackward0>)


In [None]:
for i in range(0,20):
  context, question, context_mask, question_mask, label, context_text, answers, ids = loopBatch(train_dataset,3)
  context, context_mask, question, question_mask, label = context.to(device), context_mask.to(device), question.to(device), question_mask.to(device), label.to(device)
  preds = model(context, question, context_mask, question_mask)
  print(f"Batch {i} : loss : {F.cross_entropy(preds[0], label[:,0]) + F.cross_entropy(preds[1], label[:,1])}")

Batch 0 : loss : 11.424909591674805
Batch 1 : loss : 11.35029411315918
Batch 2 : loss : 11.533506393432617
Batch 3 : loss : 11.459067344665527
Batch 4 : loss : 11.364566802978516
Batch 5 : loss : 11.543905258178711
Batch 6 : loss : 11.60793399810791
Batch 7 : loss : 11.500799179077148
Batch 8 : loss : 11.467121124267578
Batch 9 : loss : 11.52192497253418
Batch 10 : loss : 11.595657348632812
Batch 11 : loss : 11.583470344543457
Batch 12 : loss : 11.617942810058594
Batch 13 : loss : 11.575180053710938
Batch 14 : loss : 11.454253196716309
Batch 15 : loss : 11.449477195739746
Batch 16 : loss : 11.552932739257812
Batch 17 : loss : 11.547327041625977
Batch 18 : loss : 11.537164688110352
Batch 19 : loss : 11.581380844116211


In [None]:
# forward pass, get the predictions
preds = model(context, question, context_mask, question_mask)
start_pred, end_pred = preds
print(preds)
        
# separate labels for start and end position
start_label, end_label = label[:,0], label[:,1]
        
# calculate loss
loss = F.cross_entropy(start_pred, start_label) + F.cross_entropy(end_pred, end_label)
print(loss)

(tensor([[ 0.2457,  0.1388, -0.0681,  ...,    -inf,    -inf,    -inf],
        [ 0.4133,  0.4432,  0.8993,  ...,    -inf,    -inf,    -inf],
        [ 0.5402,  0.2073, -0.1529,  ...,    -inf,    -inf,    -inf],
        ...,
        [ 0.2609,  0.4207, -0.2915,  ...,    -inf,    -inf,    -inf],
        [-0.1516, -0.5904, -0.5457,  ...,    -inf,    -inf,    -inf],
        [ 0.5432,  0.1943,  0.2013,  ...,    -inf,    -inf,    -inf]],
       device='cuda:0', grad_fn=<MaskedFillBackward0>), tensor([[-0.0410, -0.2035,  0.1451,  ...,    -inf,    -inf,    -inf],
        [ 0.7891,  0.3434,  0.3288,  ...,    -inf,    -inf,    -inf],
        [-0.2050, -0.2274, -0.6105,  ...,    -inf,    -inf,    -inf],
        ...,
        [-0.6340, -1.2802, -0.1492,  ...,    -inf,    -inf,    -inf],
        [-0.2392,  0.2916, -0.4470,  ...,    -inf,    -inf,    -inf],
        [ 0.5709, -0.0878,  0.0837,  ...,    -inf,    -inf,    -inf]],
       device='cuda:0', grad_fn=<MaskedFillBackward0>))
tensor(11.4667, dev

In [None]:
def tune_embedding(grad, words=1000):
    grad[words:] = 0
    return grad

In [None]:
embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix).to(device),freeze=False ,padding_idx=0)
embedding.weight.register_hook(tune_embedding)

<torch.utils.hooks.RemovableHandle at 0x7fcc30b0af50>

In [None]:
ctx_embed = embedding(context).to(device)
ques_embed = embedding(question).to(device)
dropout = nn.Dropout(0.3)
ctx_embed = dropout(ctx_embed).to(device)
ques_embed = dropout(ques_embed).to(device)
print(f"Size of ctx_embed {ctx_embed.size()}")
print(f"Size of ques_embed {ques_embed.size()}")

Size of ctx_embed torch.Size([32, 774, 300])
Size of ques_embed torch.Size([32, 30, 300])


In [None]:
align_embedding = AlignQuestionEmbedding(300).to(device)
align_embed = align_embedding(ctx_embed, ques_embed, question_mask)

In [None]:
ctx_bilstm_input = torch.cat([ctx_embed, align_embed], dim=2)

In [None]:
context_bilstm = StackedBiLSTM(300 * 2, 128, 1, 0.3).to(device)
ctx_outputs = context_bilstm(ctx_bilstm_input)

In [None]:
question_bilstm = StackedBiLSTM(300, 128, 1, 0.3).to(device)
qtn_outputs = question_bilstm(ques_embed)
print(f"Size of qtn_outputs {qtn_outputs.size()}")

Size of qtn_outputs torch.Size([32, 30, 256])


In [None]:
linear_attn_question = LinearAttentionLayer(128*1*2) .to(device)
qtn_weights = linear_attn_question(qtn_outputs, question_mask)
print(f"Size of qtn_weights {qtn_weights.size()}")

Size of qtn_weights torch.Size([32, 30])


In [None]:
qtn_weighted = weighted_average(qtn_outputs, qtn_weights)
print(f"Size of qtn_weighted {qtn_weighted.size()}")

Size of qtn_weighted torch.Size([32, 256])


In [None]:
bilinear_attn_start = BilinearAttentionLayer(128*1*2, 128*1*2).to(device)
start_scores = bilinear_attn_start(ctx_outputs, qtn_weighted, context_mask)

In [None]:
bilinear_attn_end = BilinearAttentionLayer(128*1*2, 128*1*2).to(device)
end_scores = bilinear_attn_end(ctx_outputs, qtn_weighted, context_mask)

In [None]:
F.cross_entropy(start_scores, start_label) + F.cross_entropy(end_scores, end_label)

tensor(11.7582, device='cuda:0', grad_fn=<AddBackward0>)

In [None]:
start_scores

tensor([[-0.3630, -0.2777, -0.1493,  ...,    -inf,    -inf,    -inf],
        [-0.2866, -0.0476, -0.4334,  ...,    -inf,    -inf,    -inf],
        [ 0.0096, -0.0054,  0.1559,  ...,    -inf,    -inf,    -inf],
        ...,
        [ 0.6172,  0.6508,  0.3309,  ...,    -inf,    -inf,    -inf],
        [-0.6726, -0.2234,  0.0203,  ...,    -inf,    -inf,    -inf],
        [-0.8651, -0.4073, -0.6602,  ...,    -inf,    -inf,    -inf]],
       device='cuda:0', grad_fn=<MaskedFillBackward0>)

In [None]:
start_label

tensor([133, 272,  24,   0,  96,  33,  27,  18,  11,  35, 329, 303, 187,  13,
         47, 159,  33,  58, 145,  15,   0, 112,  99,  42,  93,  64,  18,  15,
        342,  18,   0,  33], device='cuda:0')

In [None]:
EMB_DIM = 300
HIDDEN_DIM = 128
NUM_LAYERS = 1
NUM_DIRECTIONS = 2
DROPOUT = 0.3