In [1]:
# ! wget https://nlp.cs.washington.edu/ambigqa/data/ambignq_light.zip
# ! unzip ambignq_light.zip

In [2]:
import json

In [3]:
# Load the AmbigNQ "light" training split. The context manager closes the
# handle as soon as parsing finishes instead of leaving it open for the rest
# of the kernel session; JSON is read explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open('train_light.json', encoding='utf-8') as file:
    data_ = json.load(file)

In [4]:
# Peek at the first record to see how annotations, qaPairs and answers are nested.
print(data_[0])

{'annotations': [{'type': 'multipleQAs', 'qaPairs': [{'question': 'When did the Simpsons first air on television as an animated short on the Tracey Ullman Show?', 'answer': ['April 19, 1987']}, {'question': 'When did the Simpsons first air as a half-hour prime time show?', 'answer': ['December 17, 1989']}]}], 'id': '-4469503464110108318', 'question': 'When did the simpsons first air on television?'}


In [5]:
# Flatten the nested annotations into two parallel lists:
# que[k] is a question string and ans[k] is one answer string for it.
que, ans = [], []
for record in data_:
    for annotation in record['annotations']:
        if annotation['type'] == 'multipleQAs':
            # Each qaPair carries its own question; repeat it once per answer.
            for qa in annotation['qaPairs']:
                answer_texts = qa['answer']
                que.extend(qa['question'] for _ in answer_texts)
                ans.extend(answer_texts)
        else:
            # Any other annotation type: the answers hang directly off the
            # annotation and are paired with the record's top-level question.
            answer_texts = annotation['answer']
            que.extend(record['question'] for _ in answer_texts)
            ans.extend(answer_texts)

In [6]:
import pandas as pd

In [7]:
# One row per (question, answer) pair; que and ans are parallel lists.
df = pd.DataFrame({'questions': que, 'answers': ans})

In [8]:
# Report the full (rows, columns) size, then keep only the first 10,000 pairs.
# Note that `df` is rebound here, so every later cell sees the truncated frame.
print(df.shape)
df = df.head(10000)

(27822, 2)


In [9]:
! python3 -m spacy download en
# ! python3 -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator
from torchtext import data

import spacy
import numpy as np

import random
import math
import time
from tqdm import tqdm

In [11]:
# Seed every random number generator the notebook uses (Python, NumPy,
# PyTorch CPU and CUDA) with the same value.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [12]:
# Load the English spaCy pipeline through the 'en' shortcut name; only its
# tokenizer is used below (see tokenize_en).
spacy_en = spacy.load('en')

In [13]:
def tokenize_en(text):
    """Split `text` into a list of token strings using the `spacy_en` tokenizer."""
    token_strings = []
    for token in spacy_en.tokenizer(text):
        token_strings.append(token.text)
    return token_strings

In [14]:
# SRC processes questions and TRG processes answers. Both are tokenized with
# tokenize_en, lower-cased, and wrapped in '<sos>' ... '<eos>' markers.
SRC = Field(sequential = True, tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(sequential = True, tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [15]:
# Attribute name -> Field mapping: each Example exposes `.que` (processed by SRC)
# and `.ans` (processed by TRG); batches are later read via batch.que / batch.ans.
fields = [('que', SRC), ('ans', TRG)]

In [16]:
# Build one torchtext Example per (question, answer) row.
# Iterating the column values directly replaces the label-based
# `df.questions[i]` lookups, which only work while the index is exactly
# 0..n-1 and are slow when repeated for every row.
example = [
    data.Example.fromlist([question, answer], fields)
    for question, answer in zip(df['questions'], df['answers'])
]

In [17]:
# Wrap the Example list and the field mapping in a torchtext Dataset so it can be split and batched.
AmbiQDataset = data.Dataset(example, fields)

In [18]:
# Show the attributes of the first Example: the tokenized, lower-cased question and answer.
print(vars(AmbiQDataset.examples[0]))

{'que': ['when', 'did', 'the', 'simpsons', 'first', 'air', 'on', 'television', 'as', 'an', 'animated', 'short', 'on', 'the', 'tracey', 'ullman', 'show', '?'], 'ans': ['april', '19', ',', '1987']}


In [19]:
# Seed Python's RNG and hand its state to `split` explicitly. The previous
# form, `random_state=random.seed(SEED)`, passed None because `random.seed`
# returns None; the split was reproducible only through the side effect of
# evaluating that argument.
# NOTE(review): torchtext documents a three-element split_ratio as
# (train, test, valid), while the returned tuple is ordered (train, valid, test).
# With [0.7, 0.2, 0.1] that gives valid_data 10% and test_data 20% — confirm
# this is the intended allocation before relying on the names.
random.seed(SEED)
train_data, valid_data, test_data = AmbiQDataset.split(split_ratio=[0.7, 0.2, 0.1], random_state=random.getstate())

In [20]:
# Build both vocabularies from the training split only, so validation/test
# tokens do not influence them. min_freq = 2 is the minimum count a token
# needs in train_data to receive its own vocabulary entry.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [21]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# Create one batch iterator per split; batches are placed on `device`.
# sort=False is required-looking here because the Dataset above was built
# without a sort_key — TODO confirm against the torchtext version in use.
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort=False,
    batch_size = BATCH_SIZE,
    device = device
)

In [23]:
# batch = next(iter(train_iterator))

In [24]:
# x = batch.trg[0]
# x

In [25]:
# TRG.vocab.stoi['good']

In [26]:
# x.unsqueeze(1)

In [27]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    The encoder turns `src` into an initial (hidden, cell) state; the decoder
    is then called once per target position, fed either the ground-truth
    token (teacher forcing) or its own previous arg-max prediction.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Return decoder scores of shape [trg len, batch size, output dim].

        Row 0 of the result is never written and stays all zeros, because
        decoding starts from trg[0] and the first prediction is stored at
        index 1. `teacher_forcing_ratio` is the per-step probability of
        feeding the ground-truth token instead of the model's own prediction.
        """
        n_steps, n_batch = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim
        step_scores = torch.zeros(n_steps, n_batch, vocab_size, device=self.device)

        hidden, cell = self.encoder(src)

        # The first decoder input is the first row of the target.
        step_input = trg[0]
        for t in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            step_scores[t] = scores

            # One random draw per step decides where the next input comes from.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = scores.argmax(1)
        return step_scores


In [28]:
class Encoder(nn.Module):
    """Embeds the source tokens and runs them through a multi-layer LSTM."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """Build the embedding, LSTM and dropout layers.

        input_dim is the number of embedding rows (source vocabulary size),
        emb_dim the embedding width, hid_dim the LSTM hidden size, n_layers
        the number of stacked LSTM layers, and dropout the probability used
        both inside the LSTM and on the embeddings.
        """
        super().__init__()

        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Return the LSTM's final (hidden, cell) state for `src`.

        The per-position LSTM outputs are discarded; only the final state is
        passed on.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell



In [29]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores every target-vocabulary entry."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """Build the embedding, LSTM, output projection and dropout layers.

        output_dim is both the number of embedding rows and the width of the
        prediction produced by the final linear layer.
        """
        super().__init__()
        self.output_dim, self.emb_dim = output_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.fc = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Decode one step and return (prediction, hidden, cell).

        `input` holds one token index per batch element; a leading length-1
        sequence axis is added for the LSTM and removed again before the
        linear projection.
        """
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        scores = self.fc(rnn_out.squeeze(0))
        return scores, hidden, cell



In [30]:
# Model hyperparameters. The embedding table sizes come from the vocabularies
# built on the training split.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# Encoder and decoder share HID_DIM and N_LAYERS because Seq2Seq.forward feeds
# the encoder's final (hidden, cell) state straight into the decoder LSTM.
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Assemble the full model and move its parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [31]:
def init_weights(m):
    """Overwrite every parameter of `m` in place with draws from U(-0.08, 0.08).

    Parameters of all submodules are included, and biases are treated the same
    way as weights.
    """
    for weight in m.parameters():
        nn.init.uniform_(weight.data, -0.08, 0.08)
        
# Module.apply calls init_weights on every submodule and on the model itself,
# and init_weights walks named_parameters() recursively, so nested parameters
# are re-drawn more than once. The final values are still U(-0.08, 0.08).
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4427, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2558, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=2558, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [32]:
def count_parameters(model):
    """Return the total number of scalar entries across `model`'s trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad is False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,456,830 trainable parameters


In [33]:
# Adam over all model parameters, using the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [34]:
# Index of the padding token in the answer vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Target positions equal to TRG_PAD_IDX are ignored when computing the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [35]:
def train(model, train_iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `train_iterator`.

    Args:
        model: called as model(src, trg); its output is indexed along the
            first axis and flattened to [-1, output dim].
        train_iterator: yields batches exposing `.que` and `.ans`.
        optimizer: stepped once per batch.
        criterion: loss applied to the flattened outputs and targets.
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(train_iterator):
        questions, answers = batch.que, batch.ans

        optimizer.zero_grad()
        scores = model(questions, answers)

        # Drop position 0 from both tensors, then flatten the time and batch
        # axes so the criterion sees one row per target token.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = answers[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_iterator)

In [36]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without updating the model.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with 0 as its third argument (teacher forcing off).
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(iterator):
            questions, answers = batch.que, batch.ans

            # turn off teacher forcing
            scores = model(questions, answers, 0)

            # answers = [trg len, batch size]
            # scores = [trg len, batch size, output dim]
            vocab_size = scores.shape[-1]

            # After dropping position 0 and flattening:
            # flat_targets = [(trg len - 1) * batch size]
            # flat_scores = [(trg len - 1) * batch size, output dim]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = answers[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [37]:
def epoch_time(start_time, end_time):
    """Split the interval end_time - start_time (seconds) into (minutes, seconds).

    Both parts are truncated toward zero, so fractional seconds are dropped.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [38]:
# Training driver: N_EPOCHS passes over the training iterator, each followed by
# a validation pass. CLIP is the gradient-norm limit handed to train().
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; starts at +inf so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    # The timed span covers both the training pass and the validation pass.
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so the file on disk always
    # holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # PPL (perplexity) is reported as exp(loss).
    print(f'\nEpoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

100%|██████████| 55/55 [00:03<00:00, 14.41it/s]
100%|██████████| 8/8 [00:00<00:00, 28.97it/s]
  2%|▏         | 1/55 [00:00<00:05,  9.92it/s]


Epoch: 01 | Time: 0m 4s
	Train Loss: 5.116 | Train PPL: 166.623
	 Val. Loss: 4.041 |  Val. PPL:  56.883


100%|██████████| 55/55 [00:03<00:00, 14.53it/s]
100%|██████████| 8/8 [00:00<00:00, 30.25it/s]
  4%|▎         | 2/55 [00:00<00:03, 16.77it/s]


Epoch: 02 | Time: 0m 4s
	Train Loss: 4.670 | Train PPL: 106.725
	 Val. Loss: 3.968 |  Val. PPL:  52.857


100%|██████████| 55/55 [00:03<00:00, 14.80it/s]
100%|██████████| 8/8 [00:00<00:00, 29.76it/s]
  4%|▎         | 2/55 [00:00<00:04, 12.27it/s]


Epoch: 03 | Time: 0m 3s
	Train Loss: 4.544 | Train PPL:  94.080
	 Val. Loss: 3.923 |  Val. PPL:  50.541


100%|██████████| 55/55 [00:03<00:00, 14.06it/s]
100%|██████████| 8/8 [00:00<00:00, 30.59it/s]
  4%|▎         | 2/55 [00:00<00:03, 14.38it/s]


Epoch: 04 | Time: 0m 4s
	Train Loss: 4.401 | Train PPL:  81.557
	 Val. Loss: 3.919 |  Val. PPL:  50.344


100%|██████████| 55/55 [00:03<00:00, 14.59it/s]
100%|██████████| 8/8 [00:00<00:00, 29.65it/s]
  4%|▎         | 2/55 [00:00<00:04, 11.94it/s]


Epoch: 05 | Time: 0m 4s
	Train Loss: 4.291 | Train PPL:  73.072
	 Val. Loss: 3.902 |  Val. PPL:  49.480


100%|██████████| 55/55 [00:03<00:00, 14.47it/s]
100%|██████████| 8/8 [00:00<00:00, 29.39it/s]
  4%|▎         | 2/55 [00:00<00:02, 18.25it/s]


Epoch: 06 | Time: 0m 4s
	Train Loss: 4.219 | Train PPL:  67.944
	 Val. Loss: 3.915 |  Val. PPL:  50.169


100%|██████████| 55/55 [00:03<00:00, 14.96it/s]
100%|██████████| 8/8 [00:00<00:00, 29.53it/s]
  4%|▎         | 2/55 [00:00<00:03, 14.18it/s]


Epoch: 07 | Time: 0m 3s
	Train Loss: 4.155 | Train PPL:  63.781
	 Val. Loss: 3.904 |  Val. PPL:  49.593


100%|██████████| 55/55 [00:03<00:00, 14.47it/s]
100%|██████████| 8/8 [00:00<00:00, 28.54it/s]
  4%|▎         | 2/55 [00:00<00:02, 18.13it/s]


Epoch: 08 | Time: 0m 4s
	Train Loss: 4.097 | Train PPL:  60.159
	 Val. Loss: 3.865 |  Val. PPL:  47.688


100%|██████████| 55/55 [00:03<00:00, 14.35it/s]
100%|██████████| 8/8 [00:00<00:00, 30.19it/s]
  4%|▎         | 2/55 [00:00<00:02, 19.80it/s]


Epoch: 09 | Time: 0m 4s
	Train Loss: 4.022 | Train PPL:  55.807
	 Val. Loss: 3.812 |  Val. PPL:  45.243


100%|██████████| 55/55 [00:03<00:00, 14.55it/s]
100%|██████████| 8/8 [00:00<00:00, 30.34it/s]



Epoch: 10 | Time: 0m 4s
	Train Loss: 3.957 | Train PPL:  52.292
	 Val. Loss: 3.757 |  Val. PPL:  42.822
