In [1]:
# trainloader length: 2700
# testloader length: 323

In [2]:
from collections import Counter
import numpy as np
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
import os
import traceback
import re

%matplotlib inline
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 2

In [3]:
from torchtext.experimental.datasets import SQuAD1
from torchtext.data.utils import get_tokenizer
# trainset, devset = SQuAD1()
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# dataset shape: (paragraph, question, answer, span)
trainset, devset = SQuAD1(tokenizer=tokenizer)

100%|██████████| 87599/87599 [00:23<00:00, 3743.21lines/s]


In [4]:
vocab = trainset.get_vocab()

In [5]:
import pickle
with open('train_data.pickle', 'rb') as f:
    train_data = pickle.load(f)
with open('dev_data.pickle', 'rb') as f:
    dev_data = pickle.load(f)

In [6]:
full = train_data + dev_data
counter_tag = Counter(w for row in full for w in row[-2])
vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True)
counter_ent = Counter(w for row in full for w in row[-3])
vocab_ent = sorted(counter_ent, key=counter_ent.get, reverse=True)
tag2id = {w: i for i, w in enumerate(vocab_tag)}
ent2id = {w: i for i, w in enumerate(vocab_ent)}

In [7]:
def data2tensor(row):
    p_tags = row[-2]
    p_ents = row[-3]
    match_features = row[-4]
    tag_ids = [tag2id[w] for w in p_tags]
    ent_ids = [ent2id[w] for w in p_ents]
    return row[:-4] + (match_features, ent_ids, tag_ids, torch.FloatTensor(row[-1]))
train_data = list(map(data2tensor, train_data))
dev_data = list(map(data2tensor, dev_data))

In [8]:
# trainset_small = [train_data[i] for i in range(32)]
# match_features = list(zip(*trainset_small))[6]
# match_origin, match_lower, match_lemma = list(zip(*match_features))
# padded_match_origin = torch.stack([torch.cat((torch.FloatTensor(match),
#         torch.LongTensor([1] * (809 - len(match))))) \
#         for match in match_origin])
# tensor_mo = torch.FloatTensor(padded_match_origin).unsqueeze(2)
# tensor_mo.shape

In [9]:
def pad_data(data):
    # Generate the pad id
    pad_id = vocab['<pad>']
    # Find max length of the mini-batch
    # train.get_vocab()['pad'], dev.get_vocab()['pad'] is equal to 22949
    max_p_len = max(list(zip(*data))[0])
    max_q_len = max(list(zip(*data))[1])
    paragraph_list = list(zip(*data))[2]
    question_list = list(zip(*data))[3]
    answer_list = list(zip(*data))[4]
    span_list = list(zip(*data))[5]
    match_features = list(zip(*data))[6]
    match_origin, match_lower, match_lemma = list(zip(*match_features))
    ners = list(zip(*data))[7]
    poss = list(zip(*data))[8]
    tfs = list(zip(*data))[9]
    padded_paragraphs = torch.stack([torch.cat((paragraph,
            torch.LongTensor([pad_id] * (max_p_len - len(paragraph))))) \
            for paragraph in paragraph_list])
    padded_questions = torch.stack([torch.cat((question,
            torch.tensor([pad_id] * (max_q_len - len(question))).long())) \
            for question in question_list])
    paragraph_pad_mask = torch.zeros_like(padded_paragraphs).masked_fill(padded_paragraphs == pad_id, 1)
    question_pad_mask = torch.zeros_like(padded_questions).masked_fill(padded_questions == pad_id, 1)

    padded_match_origin = torch.stack([torch.cat((torch.FloatTensor(match),
            torch.LongTensor([pad_id] * (max_p_len - len(match))))) \
            for match in match_origin]).unsqueeze(2)
    padded_match_lower = torch.stack([torch.cat((torch.FloatTensor(match),
            torch.LongTensor([pad_id] * (max_p_len - len(match))))) \
            for match in match_lower]).unsqueeze(2)
    padded_match_lemma = torch.stack([torch.cat((torch.FloatTensor(match),
            torch.LongTensor([pad_id] * (max_p_len - len(match))))) \
            for match in match_lemma]).unsqueeze(2)
    padded_ner = torch.stack([torch.cat((torch.FloatTensor(ner),
            torch.LongTensor([pad_id] * (max_p_len - len(ner))))) \
            for ner in ners]).unsqueeze(2)
    padded_pos = torch.stack([torch.cat((torch.FloatTensor(pos),
            torch.LongTensor([pad_id] * (max_p_len - len(pos))))) \
            for pos in poss]).unsqueeze(2)
    padded_tf = torch.stack([torch.cat((torch.FloatTensor(tf),
            torch.LongTensor([pad_id] * (max_p_len - len(tf))))) \
            for tf in tfs]).unsqueeze(2)

    return padded_paragraphs, padded_questions, span_list, answer_list, \
           paragraph_pad_mask, question_pad_mask, padded_match_origin, padded_match_lower, \
           padded_match_lemma, padded_ner, padded_pos, padded_tf

In [10]:
BATCH_SIZE = 32
from torch.utils.data import DataLoader
trainloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_data, num_workers=0)
testloader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_data, num_workers=0)

In [11]:
# print(trainset.get_vocab()['pad'], dev.get_vocab()['pad'])
# devset[0]

In [12]:
# glove_vec = torchtext.vocab.GloVe(name='840B', dim=300)
#

In [13]:
def build_word_embedding(vocab, pre_trained_emb_vec):
    # print(pre_trained_emb_vec.dim)
    weights_matrix = np.zeros((len(vocab), pre_trained_emb_vec.dim))
    words_found = 0
    no_word = 0
    for i, word in enumerate(vocab.itos):
        try:
            weights_matrix[i] = pre_trained_emb_vec[word]
            words_found += 1
        except:
            no_word += 1 # no such word in pre_trained_embedding: zero vector
    print('words not found:', no_word)
    print('words found:', words_found)
    # weights_matrix
    return torch.FloatTensor(weights_matrix)
#

In [14]:
if os.path.isfile('mydrqaglove_vt.npy'):
    word_emb_table = np.load('mydrqaglove_vt.npy')
else:
    glove_vec = torchtext.vocab.GloVe(name='840B', dim=300)
    word_emb_table = build_word_embedding(vocab, glove_vec)
    np.save('mydrqaglove_vt.npy',word_emb_table)

In [15]:
class AlignedQuestionEmbedding(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(input_dim, input_dim)
        self.linear2 = nn.Linear(input_dim, input_dim)

    def forward(self, paragraph, question, question_pad_mask):

        p = self.relu(self.linear1(paragraph))

        q = self.relu(self.linear2(question))

        # q = self.relu(self.linear1(question))
        q = q.permute(0, 2, 1)

        dot_product = torch.bmm(p, q)
        # print(dot_product.size())
        # print(question_pad_mask.size())
        question_mask_expand = question_pad_mask.unsqueeze(1).expand(dot_product.size())
        dot_product = dot_product.masked_fill(question_mask_expand == 1, -float('inf'))

        attn_score = F.softmax(dot_product.view(-1, question.size(1)), dim=1)
        attn_score = attn_score.view(-1, paragraph.shape[1], question.shape[1])

        aligned_embedding = torch.bmm(attn_score, question)
        return aligned_embedding

In [16]:
class MultiLayerBiLSTM(nn.Module):

    def __init__(self, input_size, hidden_size, nlayers, dropout):
        super().__init__()
        self.nlayers = nlayers

        self.lstms = nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout)
        self.lstms.append(nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True))
        for i in range(1, nlayers):
            self.lstms.append(nn.LSTM(hidden_size * 2, hidden_size,
                                      batch_first=True, bidirectional=True))

    def forward(self, x):
        x = self.dropout(x)
        lstm_output, _ = self.lstms[0](x)
        hidden_states = [lstm_output]
        # print(lstm_output.size(), hidden_state.size(), cell_state.size())
        for i in range(1, self.nlayers):
            # lstm_output = self.dropout(lstm_output)
            lstm_output, _ = self.lstms[i](lstm_output)
            # print(lstm_output.size(), hidden_state.size(), cell_state.size())
            hidden_states.append(lstm_output)

        output = torch.cat(hidden_states, dim=2)

        output = self.dropout(output)
        return output


In [17]:
class QuestionEncoding(nn.Module):
    def __init__(self, input_size, hidden_size, nlayers, dropout):
        super().__init__()
        self.lstm_output_size = hidden_size * 6
        self.linear = nn.Linear(self.lstm_output_size, 1)
        self.lstm = MultiLayerBiLSTM(input_size, hidden_size, nlayers, dropout)
        # biLSTM output size: hidden size * 6
    def forward(self, x, question_mask):
        try:
            x = self.lstm(x)
            b = x.contiguous().view(-1, x.size(-1))
            b = self.linear(b) # attention score
            b = b.view(x.shape[0], -1)
            # print(x.size(), question_mask.size())
            b = b.masked_fill(question_mask == 1, -float('inf')) # masking
            b = F.softmax(b, dim=1)

            encoding = b.unsqueeze(1).bmm(x).squeeze(1)
            # print(x.size(), x_lstm.size())
            return encoding
        except:
            print('question mask size:', question_mask.size())
            print('x size:', x.size())
            print('b size:', b.size())
            print(traceback.print_exc())

In [18]:
class PredictionLayer(nn.Module):
    def __init__(self, p_size, q_size):
        super().__init__()
        self.linear = nn.Linear(q_size, p_size)

    def forward(self, paragraph, question, paragraph_mask):
        Wq = self.linear(question)
        pWq = paragraph.bmm(Wq.unsqueeze(2)).squeeze(2)
        pWq = pWq.masked_fill(paragraph_mask == 1, -float('inf'))
        return pWq

In [19]:
def fixate_embedding(grad):
    grad[1000:] = 0
    return grad

class DocumentReader(nn.Module):
    def __init__(self, hidden_size, embedding_size, nlayers, dropout, device):
        super().__init__()
        self.device = device

        self.word_embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(word_emb_table).to(device), freeze=False)
        self.word_embedding_layer.weight.register_hook(fixate_embedding)
        # print(embedding_size)
        self.aligned_embedding_layer = AlignedQuestionEmbedding(embedding_size)
        # All encodings
        self.paragraph_lstm = MultiLayerBiLSTM(embedding_size * 2 + 6, hidden_size, nlayers, dropout)
        # 2 encodings (word embedding + aligned question embedding)
        # self.paragraph_lstm = MultiLayerBiLSTM(embedding_size * 2, hidden_size, nlayers, dropout)

        self.question_encoder = QuestionEncoding(embedding_size, hidden_size, nlayers, dropout)

        self.prediction_layer_start = PredictionLayer(hidden_size * nlayers * 2,
                                                          hidden_size * nlayers * 2)
        self.prediction_layer_end = PredictionLayer(hidden_size * nlayers * 2,
                                                        hidden_size * nlayers * 2)

        self.dropout = nn.Dropout(dropout)
    def forward(self, paragraph, question, paragraph_mask, question_mask, match_origin, match_lower, match_lemma, ner, pos, tf):
        # embeddings
        p_word_embedding = self.word_embedding_layer(paragraph)
        q_word_embedding = self.word_embedding_layer(question)
        p_word_embedding = self.dropout(p_word_embedding)
        q_word_embedding = self.dropout(q_word_embedding)
        aligned_embedding = self.aligned_embedding_layer(p_word_embedding, q_word_embedding, question_mask)
        paragraph_embeddings = torch.cat([p_word_embedding, aligned_embedding, match_origin,
                                           match_lower, match_lemma, ner, pos, tf], dim=2)

        # paragraph_embeddings = torch.cat([em_embedding.to(device), p_word_embedding.to(device), aligned_embedding.to(device)], dim=2)
        paragraph_encoding = self.paragraph_lstm(paragraph_embeddings)
        # print(question.size(), question_mask.size())
        question_encoding = self.question_encoder(q_word_embedding, question_mask)

        prediction_start = self.prediction_layer_start(paragraph_encoding, question_encoding, paragraph_mask)
        prediction_end = self.prediction_layer_end(paragraph_encoding, question_encoding, paragraph_mask)

        return prediction_start, prediction_end

In [20]:
HIDDEN_SIZE = 128
EMB_SIZE = 300
NLAYERS = 3
DROPOUT = 0.4
# device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DocumentReader(HIDDEN_SIZE,
                       EMB_SIZE,
                       NLAYERS,
                       DROPOUT,
                       device).to(device)

In [21]:
# dataiter = iter(trainloader)
# dataiter_next = dataiter.next()
# print(dataiter_next)
# (p, q, a, s, p_mask, q_mask) = dataiter.next()
# writer.add_graph(model, p, p_mask, q_mask)
# writer.close()

In [22]:
optimizer = torch.optim.Adamax(model.parameters())

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min', factor=0.1, patience=2, verbose=True)

In [23]:
from time import time
def train(model, train_dataset):
    '''
    Trains the model.
    '''

    print("Start training ........")

    train_loss = 0.

    # put the model in training mode
    model.train()
    start_time = time()
    # iterate through training data
    try:
        for i, (paragraphs, questions, span_list, answer_list,
                paragraph_mask, question_mask, match_origin, match_lower, match_lemma, ner, pos, tf) in enumerate(train_dataset):
            if i % 500 == 0:
                print(f"Starting batch: {i}, time: {time() - start_time}")


            # place the tensors on GPU
            paragraphs = paragraphs.to(device)
            paragraph_mask = paragraph_mask.to(device)
            questions = questions.to(device)
            question_mask = question_mask.to(device)
            match_origin = match_origin.to(device)
            match_lower = match_lower.to(device)
            match_lemma = match_lemma.to(device)
            ner = ner.to(device)
            pos = pos.to(device)
            tf = tf.to(device)

            # forward pass, get the predictions
            preds = model(paragraphs, questions, paragraph_mask, question_mask,
                          match_origin, match_lower, match_lemma, ner, pos, tf)

            start_pred, end_pred = preds

            # print('preds:', start_pred, end_pred)
            # separate labels for start and end position
            span_start = []
            span_end = []
            for span in span_list:
                span_start.append(span[0][0].item())
                span_end.append(span[0][1].item())

            # print('span:', span_start, span_end)
            span_start = torch.LongTensor(span_start).to(device)
            span_end = torch.LongTensor(span_end).to(device)
            # calculate loss
            loss = F.cross_entropy(start_pred, span_start) + F.cross_entropy(end_pred, span_end)

            # backward pass, calculates the gradients
            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10)

            # update the gradients
            optimizer.step()
            # zero the gradients to prevent them from accumulating
            optimizer.zero_grad()

            train_loss += loss.item()
    except Exception as e:
        print(f'i:{i}')
        print(f'Types of paragraphs, questions, span_list, answer_list, '
              f'paragraph_mask, question_mask, match_features, ner, pos, tf, match_features[0]:'
              f'{type(paragraphs)}, {type(questions)}, {type(span_list)}, {type(answer_list)},'
              f'{type(paragraph_mask)}, {type(question_mask)}, {type(match_origin)}, {type(ner)},'
              f'{type(pos)}, {type(tf)}, ')
        print(f'Size of paragraphs, questions, '
              f'paragraph_mask, question_mask, match_features, ner, pos, tf, match_features[0]:'
              f'{paragraphs.size()}, {questions.size()},'
              f'{(paragraph_mask.size())}, {(question_mask.size())}, {(match_origin.size())}, {(match_lower.size())}, {(match_lemma.size())}'
              f'{(ner.size())}, {(pos.size())}, {(tf.size())}')
        print(f'sizes of pred:{start_pred.size()} / span:{span_start.size()}')
        print(f'span_start: {span_start[23]}\nspan_end: {span_end[23]}')
        print(f'i: {i}')
        print(f'paragraph: {paragraphs}')
        bad_p = paragraphs.numpy()[23]
        bad_q = questions.numpy()[23]
        bad_p_text = [vocab.itos[pi] for pi in bad_p]
        bad_q_text = [vocab.itos[qi] for qi in bad_q]
        bad_p_text = ' '.join(bad_p_text)
        bad_q_text = ' '.join(bad_q_text)

        print(bad_p_text)
        print(bad_q_text)
        print(f'paragraph size: {paragraphs.size()}, question size: {questions.size()}')
        print(traceback.format_exc())


    return train_loss / len(train_dataset)

In [24]:
def validate(model, test_dataset):
    '''
    Validates the model.
    '''

    print("Start validation ........")

    val_loss = 0.
    emScore = 0
    f1Score = 0
    # put the model in eval mode
    model.eval()
    start_time = time()
    # iterate through training data
    for i, (paragraphs, questions, span_list, answer_list,
            paragraph_mask, question_mask, match_origin, match_lower, match_lemma, ner, pos, tf) in enumerate(test_dataset):
        if i % 500 == 0:
            print(f"Starting batch: {i}, time: {time() - start_time}")


        # place the tensors on GPU
        paragraphs = paragraphs.to(device)
        paragraph_mask = paragraph_mask.to(device)
        questions = questions.to(device)
        question_mask = question_mask.to(device)
        match_origin = match_origin.to(device)
        match_lower = match_lower.to(device)
        match_lemma = match_lemma.to(device)
        ner = ner.to(device)
        pos = pos.to(device)
        tf = tf.to(device)

        # forward pass, get the predictions
        preds = model(paragraphs, questions, paragraph_mask, question_mask,
                      match_origin, match_lower, match_lemma, ner, pos, tf)

        start_pred, end_pred = preds
        # print('preds:', start_pred, end_pred)
        log_softmax = nn.LogSoftmax(dim=1) # batchwise log softmax
        pred_table = log_softmax(start_pred).unsqueeze(2) + log_softmax(end_pred).unsqueeze(1)
        pred_mask1 = (torch.ones_like(pred_table) * -float('inf')).tril(diagonal=-1)# start index <= end index
        pred_mask2 = (torch.ones_like(pred_table) * -float('inf')).triu(diagonal=16)
        pred_table += pred_mask1 + pred_mask2

        start_pred_argmax = []
        end_pred_argmax = []
        paragraph_length = pred_table.shape[-1]
        for batch in pred_table:
            arg_max = batch.argmax()
            start_pred_argmax.append(arg_max // paragraph_length)
            end_pred_argmax.append(arg_max % paragraph_length)

        # separate labels for start and end position
        span_start = []
        span_end = []
        true_answers_list = []
        my_answers = []
        for paragraph, spans, answers, sp, ep in \
                zip(paragraphs, span_list, answer_list, start_pred_argmax, end_pred_argmax):
            span_start.append([span[0].item() for span in spans][:3])
            span_end.append([span[1].item() for span in spans][:3])
            true_answers_list.append([ans2txt(answer) for answer in answers])
            if sp > ep or ep > sp + 15:
                print(f'wrong range, sp:{sp}, ep:{ep} ')
            my_answers.append(span2txt([sp, ep + 1], paragraph))
        with torch.no_grad():
            # print('span:', span_start, span_end)
            try:
                span_start = torch.LongTensor(span_start).to(device)
                span_end = torch.LongTensor(span_end).to(device)
                # calculate loss
                loss = [F.cross_entropy(start_pred, span_start.t()[i]) +
                        F.cross_entropy(end_pred, span_end.t()[i]) for i in range(3)]
                loss = min(loss)

                val_loss += loss.item()

                emScore += em_batch(my_answers, true_answers_list)
                f1Score += f1_batch(my_answers, true_answers_list)
            except:
                print('start pred:', start_pred)
                print('start pred shape:', start_pred.shape)
                print('span_list:', span_list)
                print('span_list length:', len(span_list))
                print('span_start:', span_start)
                print('span_start shape:', np.asarray(span_start).shape)
                print('span_end:', span_end)
                print(traceback.format_exc())

    return val_loss / len(test_dataset), emScore / len(test_dataset), f1Score / len(test_dataset)

In [25]:
from string import punctuation
import unicodedata
def normalize_answer(s):
    s = re.sub(r'\s', ' ', s)
    s = s.lower()
    s = unicodedata.normalize('NFD', s)
    s = s.translate(str.maketrans('','',punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def em_batch(my_answers, true_answers_list):
    # true_answers_list: batch size * 3
    em = 0
    for my_answer, true_answers in zip(my_answers, true_answers_list):
        for true_answer in true_answers:
            if my_answer == true_answer:
                em += 1
                break
    return em / BATCH_SIZE

def f1_batch(my_answers, true_answers_list):
    f1Batch = 0
    for my_answer, true_answers in zip(my_answers, true_answers_list):
        f1_single = 0
        for true_answer in true_answers:
            my_answer_split = my_answer.split()
            true_answer_split = true_answer.split()
            common = Counter(my_answer_split) & Counter(true_answer_split)
            num_intersection = sum(common.values())
            if num_intersection == 0:
                continue
            precision = num_intersection / len(my_answer_split)
            recall = num_intersection / len(true_answer_split)
            f1_single = max((2 * precision * recall) / (precision + recall), f1_single)
        f1Batch += f1_single
        # if f1_single < 0.9:
        #     print('my answer split:', my_answer_split)
        #     print('true answer split:', true_answer_split)
    return f1Batch / BATCH_SIZE

In [26]:
def span2txt(span, paragraph):
    # print(span[0].item())
    my_answer = paragraph[int(span[0].item()) : int(span[1].item()) + 1]
    return ans2txt(my_answer)
def ans2txt(answer):
    words = []
    for a_index in answer:
        words.append(vocab.itos[a_index.item()])
    return normalize_answer(' '.join(words))


In [27]:
import datetime
now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
best_val_loss = 100
path = 'best.pt'
if os.path.isfile(path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
else:
    epoch = 0

In [28]:
train_losses = []
val_losses = []
em_scores = []
f1_scores = []
epoch_start = max(epoch, 0)
for epoch in range(epoch_start, epoch_start + 50):
    print(f'Staring epoch {epoch}')
    start_time = time()

    train_loss = train(model, trainloader)
    val_loss, emScore, f1Score = validate(model, testloader)
    scheduler.step(val_loss)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    em_scores.append(emScore * 100)
    f1_scores.append(f1Score * 100)

    if best_val_loss > val_loss:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': val_loss,
            }, path)
    end_time = time()

    time_elapsed = end_time - start_time
    print(f'train_loss: {train_loss}, val_loss: {val_loss}')
    print(f'em_score: {emScore * 100}, f1_score: {f1Score * 100}')
    print(f'End epoch {epoch}, elapsed time: {time_elapsed}')

Staring epoch 0
Start training ........
Starting batch: 0, time: 0.01696467399597168
Starting batch: 500, time: 95.38166618347168
Starting batch: 1000, time: 183.31170916557312
Starting batch: 1500, time: 276.1832904815674
Starting batch: 2000, time: 365.23765325546265
Starting batch: 2500, time: 452.24517130851746
Start validation ........
Starting batch: 0, time: 0.008034229278564453
train_loss: 5.06880079396279, val_loss: 3.590098239280082
em_score: 30.176158301158303, f1_score: 58.076744179725324
End epoch 0, elapsed time: 494.2568910121918
Staring epoch 1
Start training ........
Starting batch: 0, time: 0.012033939361572266
Starting batch: 500, time: 89.81051254272461
Starting batch: 1000, time: 174.71850204467773
Starting batch: 1500, time: 261.39082884788513
Starting batch: 2000, time: 347.37371349334717
Starting batch: 2500, time: 434.4192576408386
Start validation ........
Starting batch: 0, time: 0.008037090301513672
train_loss: 3.925398392679099, val_loss: 3.2061767923325646

KeyboardInterrupt: 

In [None]:
import csv
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(train_losses)
    writer.writerow(val_losses)
    writer.writerow(em_scores)
    writer.writerow(f1_scores)

In [None]:
# val_loss, emScore, f1Score = validate(model, testloader)
# print(f'train_loss: {train_loss}, val_loss: {val_loss}')
# print(f'em_score: {emScore * 100}, f1_score: {f1Score * 100}')

In [None]:
plt.plot([i for i in range(epoch - len(train_losses) + 1, epoch + 1)], train_losses)
plt.plot([i for i in range(epoch - len(val_losses) + 1, epoch + 1)], val_losses)
plt.xlabel('Epochs')
plt.ylabel('CE losses')
plt.title('Training Result')
plt.legend(['Train', 'Test'])
plt.show()

In [None]:
plt.plot([i for i in range(epoch - len(em_scores) + 1, epoch + 1)], em_scores)
plt.plot([i for i in range(epoch - len(f1_scores) + 1, epoch + 1)], f1_scores)
plt.xlabel('Epochs')
plt.ylabel('Scores')
plt.title('Scores on SQuAD 1.1')
plt.legend(['EM', 'F1'])
plt.xticks([i for i in range(1, len(f1_scores) + 1)])
plt.yticks(np.arange(15, 70, 5))
plt.savefig(f"best{epoch - len(f1_scores) + 1}-{epoch}.png", dpi=350)
plt.grid(True)
plt.show()
