In [1]:
! git clone https://github.com/deepmind/narrativeqa.git

fatal: destination path 'narrativeqa' already exists and is not an empty directory.


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/content/narrativeqa/qaps.csv')
print(df.shape)
df.head()

(46765, 8)


Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
0,0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,Who is Mark Hunter ?,He is a high school student in Phoenix .,A loner and outsider student with a radio stat...
1,0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona",Where does this radio station take place ?,It takes place in Mark s parents basement .,"Phoenix , Arizona"
2,0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,Why do more students tune into Mark s show ?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...
3,0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,Who commits suicide ?,Malcolm .,Malcolm .
4,0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,What does Paige jam into her microwave ?,She jams her medals and accolades .,Her award medals


In [4]:
df = df[['question', 'answer1']]
df.head()

Unnamed: 0,question,answer1
0,Who is Mark Hunter?,He is a high school student in Phoenix.
1,Where does this radio station take place?,It takes place in Mark's parents basement.
2,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...
3,Who commits suicide?,Malcolm.
4,What does Paige jam into her microwave?,She jams her medals and accolades.


In [5]:
! python3 -m spacy download en
# ! python3 -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Field, BucketIterator
from torchtext import data

import spacy
import numpy as np

import random
import math
import time
from tqdm import tqdm

In [7]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [8]:
spacy_en = spacy.load('en')

In [9]:
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
SRC = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [11]:
fields = [('question', SRC), ('answer1', TRG)]

In [12]:
example = [data.Example.fromlist([df.question[i],df.answer1[i]], fields) for i in range(df.shape[0])] 

In [13]:
NQDataset = data.Dataset(example, fields)

In [14]:
print(vars(NQDataset.examples[0]))

{'question': ['who', 'is', 'mark', 'hunter', '?'], 'answer1': ['he', 'is', 'a', 'high', 'school', 'student', 'in', 'phoenix', '.']}


In [15]:
train_data, valid_data, test_data = NQDataset.split(split_ratio=[0.7, 0.2, 0.1], random_state=random.seed(SEED))

In [16]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort=False,
    batch_size = BATCH_SIZE,
    device = device
)

In [19]:
# batch = next(iter(train_iterator))

In [20]:
# x = batch.trg[0]
# x

In [21]:
# TRG.vocab.stoi['good']

In [22]:
# x.unsqueeze(1)

In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            outputs[t] = output
            
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
            
            input = trg[t] if teacher_force else top1

        return outputs

In [24]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        
        embedded = self.dropout(self.embedding(src))        
        outputs, hidden = self.rnn(embedded)      
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        return outputs, hidden

In [25]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        attention = self.v(energy).squeeze(2)
                
        return F.softmax(attention, dim=1)

In [26]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
        
        input = input.unsqueeze(0)
                
        embedded = self.dropout(self.embedding(input))
                
        a = self.attention(hidden, encoder_outputs)          
        a = a.unsqueeze(1)
                
        encoder_outputs = encoder_outputs.permute(1, 0, 2)        
        weighted = torch.bmm(a, encoder_outputs)        
        weighted = weighted.permute(1, 0, 2)
                
        rnn_input = torch.cat((embedded, weighted), dim = 2)
                    
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
                
        return prediction, hidden.squeeze(0)

In [27]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(SRC.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [28]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11001, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(11001, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=11001, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [29]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 31,790,585 trainable parameters


In [30]:
optimizer = optim.Adam(model.parameters())

In [31]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [32]:
def train(model, train_iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(train_iterator)):
        src, trg = batch.question, batch.answer1
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(train_iterator)

In [33]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(tqdm(iterator)):

            src = batch.question
            trg = batch.answer1

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [34]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

100%|██████████| 256/256 [01:30<00:00,  2.82it/s]
100%|██████████| 37/37 [00:04<00:00,  7.72it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 35s
	Train Loss: 5.770 | Train PPL: 320.439
	 Val. Loss: 5.342 |  Val. PPL: 208.911


100%|██████████| 256/256 [01:34<00:00,  2.72it/s]
100%|██████████| 37/37 [00:04<00:00,  7.62it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 38s
	Train Loss: 5.191 | Train PPL: 179.698
	 Val. Loss: 5.231 |  Val. PPL: 186.963


100%|██████████| 256/256 [01:35<00:00,  2.68it/s]
100%|██████████| 37/37 [00:04<00:00,  7.48it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 40s
	Train Loss: 4.843 | Train PPL: 126.883
	 Val. Loss: 5.216 |  Val. PPL: 184.259


100%|██████████| 256/256 [01:36<00:00,  2.64it/s]
100%|██████████| 37/37 [00:04<00:00,  7.45it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 41s
	Train Loss: 4.460 | Train PPL:  86.461
	 Val. Loss: 5.284 |  Val. PPL: 197.200


100%|██████████| 256/256 [01:36<00:00,  2.66it/s]
100%|██████████| 37/37 [00:04<00:00,  7.51it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 41s
	Train Loss: 3.999 | Train PPL:  54.569
	 Val. Loss: 5.401 |  Val. PPL: 221.625


100%|██████████| 256/256 [01:36<00:00,  2.65it/s]
100%|██████████| 37/37 [00:04<00:00,  7.50it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 06 | Time: 1m 41s
	Train Loss: 3.465 | Train PPL:  31.974
	 Val. Loss: 5.629 |  Val. PPL: 278.290


100%|██████████| 256/256 [01:36<00:00,  2.66it/s]
100%|██████████| 37/37 [00:04<00:00,  7.53it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 07 | Time: 1m 41s
	Train Loss: 2.986 | Train PPL:  19.800
	 Val. Loss: 5.786 |  Val. PPL: 325.837


100%|██████████| 256/256 [01:35<00:00,  2.67it/s]
100%|██████████| 37/37 [00:04<00:00,  7.53it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 08 | Time: 1m 40s
	Train Loss: 2.558 | Train PPL:  12.910
	 Val. Loss: 5.969 |  Val. PPL: 390.970


100%|██████████| 256/256 [01:36<00:00,  2.67it/s]
100%|██████████| 37/37 [00:04<00:00,  7.48it/s]
  0%|          | 0/256 [00:00<?, ?it/s]

Epoch: 09 | Time: 1m 40s
	Train Loss: 2.246 | Train PPL:   9.447
	 Val. Loss: 6.146 |  Val. PPL: 466.952


100%|██████████| 256/256 [01:37<00:00,  2.64it/s]
100%|██████████| 37/37 [00:04<00:00,  7.53it/s]

Epoch: 10 | Time: 1m 42s
	Train Loss: 2.000 | Train PPL:   7.389
	 Val. Loss: 6.249 |  Val. PPL: 517.732



