In [1471]:
# Default tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Vocab:
    def __init__(self, name):
        self.name = name
        self.word_2_index = {"PAD":PAD_token, "SOS":SOS_token, "EOS":EOS_token} # words converted to index
        self.word_2_count = {} # unique word count
        self.index_2_word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD
    
    def build_vocab(self, data_set):
        for s in data_set:
            self.add_sentence(s)
            
    def add_sentence(self, sentence):
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word_2_index:
            self.word_2_index[word] = self.num_words
            self.word_2_count[word] = 1
            self.index_2_word[self.num_words] = word
            self.num_words += 1
        else:
            self.word_2_count[word] += 1    

In [1472]:
import nltk
import numpy as np
import pandas as pd
import gzip
import re
from torchtext import datasets
from nltk.tokenize import RegexpTokenizer
# from nltk.stem import PorterStemme
import random
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
MAX_LENGTH = 10

def loadDF(data_iter): # loading the dataset into Pandas Data frame for processing.
    data = {"Question": list(), "Answer": list()}
    for _, question, answer, _ in data_iter:
        if len(question) != 0 and len(answer[0]) != 0:
            data["Question"].append(question)
            data["Answer"].append(answer[0])
    df = pd.DataFrame(data)
    return df

def prepare_text(sentence): # the text is cleaned using tokenizer
    
    # using a stemmer to reduce the number of words that serve as an input
    # Lowercase, and remove non-letter characters
    sentence = sentence.lower()
    sentence = re.sub(r"([.!?])", r" \1", sentence)
    sentence = re.sub(r"\s+", r" ", sentence).strip()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    
    new_tokens = [token for token in tokens if not token.isdigit()]
    return new_tokens

def filter(temp):
    filtered_q = list()
    filtered_a = list()
    
    q = temp["Question"].to_list()
    a = temp["Answer"].to_list()
    
    for i in range(0, len(q)):
        temp_q = len(q[i])
        temp_a = len(a[i])
        if temp_q < MAX_LENGTH and temp_a < MAX_LENGTH:
            filtered_q.append(q[i])
            filtered_a.append(a[i])
    
    return filtered_q, filtered_a


In [1516]:
# loading data using torch text
train_iter, test_iter = datasets.SQuAD2("./data/SQuAD2", split=("train", "dev"))
train_df = loadDF(train_iter).iloc[:20000]
test_df = loadDF(test_iter).iloc[:500]

# prepare_text function is applied to every sentence
train_df["Question"] = train_df["Question"].apply(prepare_text) 
train_df["Answer"] = train_df["Answer"].apply(prepare_text)

test_df["Question"] = test_df["Question"].apply(prepare_text)
test_df["Answer"] = test_df["Answer"].apply(prepare_text)

a , b = filter(train_df)
print(len(a))
print("\n")
print(len(b))


src_vocab = Vocab("Question_Vocab")
src_vocab.build_vocab(a)

trg_vocab = Vocab("Answer_Vocab")
trg_vocab.build_vocab(b)

9097


9097


In [1474]:
print(src_vocab.num_words)
print(trg_vocab.num_words)

8722
7181


In [1475]:
print(len(train_df["Question"]) // 128) 
print(1000 // 128)



156
7


In [1477]:
def indexify_padding(temp, vocab): # temp - lists, vocab - object different for q(src) and a(trg)
#     print(vocab.name)
    new_lists = list()
    max_len = 0
    for s in temp:
        temp_l = len(s)
        if temp_l > max_len:
            max_len = temp_l
            
    max_len += 2 # to add <sos> and <eos> tokens
    for l in temp:
        new_l = [vocab.word_2_index[word] for word in l]
        new_l.insert(0, SOS_token) # adding <sos> token
        new_l.append(EOS_token) # adding <eos> token
        
        while len(new_l) < max_len: # adding <pad> tokens to balance
            new_l.append(PAD_token)
            
        new_lists.append(new_l)
    
    return new_lists


#batch_size = 128
def get_batches(src, trg, batch_size=128,src_vocab=src_vocab,trg_vocab=trg_vocab):
    n_batches = len(src) // batch_size # integer division
    for i in range(0, n_batches):
        src_batch = src[i*batch_size:(batch_size*(i+1))]
        src_batch = np.array(indexify_padding(src_batch, src_vocab)).T
        
        trg_batch = trg[i*batch_size:(batch_size*(i+1))]
        trg_batch = np.array(indexify_padding(trg_batch, trg_vocab)).T
        
        yield src_batch, trg_batch, n_batches
        
        
# SRC = train_df["Question"].to_list()
# TRG = train_df["Answer"].to_list()

SRC, TRG = filter(train_df)
test_SRC, test_TRG = filter(test_df)

test_src_vocab = Vocab("Test src")
test_src_vocab.build_vocab(test_SRC)

test_trg_vocab = Vocab("Test trg")
test_trg_vocab.build_vocab(test_TRG)


In [1478]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers, dropout):
        
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.embedding_dim = embedding_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(self.input_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_size, n_layers, dropout=dropout) 
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, i):
        
        embedding = self.dropout(self.embedding(i))
        o, (h,c) = self.lstm(embedding)      
        
        return h, c
    

class Decoder(nn.Module):
      
    def __init__(self, output_size, embedding_size,hidden_size, n_layers, dropout):
        
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers
        
        # The LSTM produces an output by passing the hidden state to the   Linear layer
        self.embedding = nn.Embedding(self.output_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, i, h, c): # i - input, h - hidden state, c - cell state
        
        i = i.unsqueeze(0)
        embedding = self.dropout(self.embedding(i))
        o, (h, c) = self.lstm(embedding, (h, c))
        o = self.out(o.squeeze(0)) # o - prediction
        o = self.softmax(o)
        
        return o, h, c
        
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, device):
        
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hidden_size == decoder.hidden_size, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
    
    
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):  
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            outputs[t] = output
            
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
            
            input = trg[t] if teacher_force else top1
            
        return outputs

    



In [1479]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = src_vocab.num_words
OUTPUT_DIM = trg_vocab.num_words
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [1480]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(8722, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.2)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7181, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.2)
    (out): Linear(in_features=512, out_features=7181, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (softmax): LogSoftmax(dim=1)
  )
)

In [1481]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 19,313,933 trainable parameters


In [1482]:
optimizer = torch.optim.Adam(model.parameters())
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion =  nn.CrossEntropyLoss(ignore_index = PAD_token)
# criterion = nn.NLLLoss(ignore_index=PAD_token)

In [1483]:
def train(model, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for src, trg, n_batches in get_batches(SRC, TRG):
        
        src = torch.from_numpy(src)
        trg = torch.from_numpy(trg)
    
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        
        # flattening and getting rid of <sos> and 0 in trg and output respectively
        output = output[1:].view(-1, output.shape[-1]) 
        trg = trg[1:].contiguous().view(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / n_batches

In [1484]:
def evaluate(model, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for src, trg, n_batches in get_batches(test_SRC, test_TRG, 128, test_src_vocab, test_trg_vocab):
            src = torch.from_numpy(src)
            trg = torch.from_numpy(trg)

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].contiguous().view(-1)
            
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / n_batches

In [1485]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [1486]:
N_EPOCHS = 1000
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, criterion)

    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


Epoch: 01 | Time: 0m 28s
	Train Loss: 6.471 | Train PPL: 646.381
	 Val. Loss: 6.315 |  Val. PPL: 552.570
Epoch: 02 | Time: 0m 26s
	Train Loss: 5.927 | Train PPL: 374.988
	 Val. Loss: 6.214 |  Val. PPL: 499.730
Epoch: 03 | Time: 0m 26s
	Train Loss: 5.891 | Train PPL: 361.747
	 Val. Loss: 6.314 |  Val. PPL: 552.391
Epoch: 04 | Time: 0m 26s
	Train Loss: 5.885 | Train PPL: 359.509
	 Val. Loss: 6.422 |  Val. PPL: 615.207
Epoch: 05 | Time: 0m 26s
	Train Loss: 5.792 | Train PPL: 327.574
	 Val. Loss: 6.348 |  Val. PPL: 571.355
Epoch: 06 | Time: 0m 26s
	Train Loss: 5.717 | Train PPL: 303.923
	 Val. Loss: 6.385 |  Val. PPL: 592.819
Epoch: 07 | Time: 0m 26s
	Train Loss: 5.683 | Train PPL: 293.784
	 Val. Loss: 6.419 |  Val. PPL: 613.354
Epoch: 08 | Time: 0m 26s
	Train Loss: 5.658 | Train PPL: 286.564
	 Val. Loss: 6.479 |  Val. PPL: 651.420
Epoch: 09 | Time: 0m 26s
	Train Loss: 5.640 | Train PPL: 281.452
	 Val. Loss: 6.497 |  Val. PPL: 662.988
Epoch: 10 | Time: 0m 26s
	Train Loss: 5.593 | Train PPL

Epoch: 79 | Time: 0m 26s
	Train Loss: 2.653 | Train PPL:  14.195
	 Val. Loss: 10.067 |  Val. PPL: 23551.080
Epoch: 80 | Time: 0m 26s
	Train Loss: 2.585 | Train PPL:  13.263
	 Val. Loss: 10.147 |  Val. PPL: 25510.510
Epoch: 81 | Time: 0m 28s
	Train Loss: 2.639 | Train PPL:  13.994
	 Val. Loss: 10.105 |  Val. PPL: 24473.652
Epoch: 82 | Time: 0m 27s
	Train Loss: 2.507 | Train PPL:  12.263
	 Val. Loss: 10.171 |  Val. PPL: 26136.828
Epoch: 83 | Time: 0m 27s
	Train Loss: 2.686 | Train PPL:  14.669
	 Val. Loss: 10.247 |  Val. PPL: 28189.210
Epoch: 84 | Time: 0m 26s
	Train Loss: 2.571 | Train PPL:  13.074
	 Val. Loss: 10.202 |  Val. PPL: 26954.749
Epoch: 85 | Time: 0m 27s
	Train Loss: 2.482 | Train PPL:  11.960
	 Val. Loss: 10.364 |  Val. PPL: 31684.733
Epoch: 86 | Time: 0m 26s
	Train Loss: 2.457 | Train PPL:  11.668
	 Val. Loss: 10.314 |  Val. PPL: 30150.024
Epoch: 87 | Time: 0m 26s
	Train Loss: 2.446 | Train PPL:  11.543
	 Val. Loss: 10.390 |  Val. PPL: 32544.004
Epoch: 88 | Time: 0m 26s
	Tr

Epoch: 154 | Time: 0m 26s
	Train Loss: 0.768 | Train PPL:   2.156
	 Val. Loss: 13.808 |  Val. PPL: 992087.554
Epoch: 155 | Time: 0m 26s
	Train Loss: 0.719 | Train PPL:   2.052
	 Val. Loss: 13.892 |  Val. PPL: 1079464.241
Epoch: 156 | Time: 0m 26s
	Train Loss: 0.829 | Train PPL:   2.291
	 Val. Loss: 13.890 |  Val. PPL: 1077852.286
Epoch: 157 | Time: 0m 26s
	Train Loss: 0.812 | Train PPL:   2.253
	 Val. Loss: 13.941 |  Val. PPL: 1133952.963
Epoch: 158 | Time: 0m 26s
	Train Loss: 1.013 | Train PPL:   2.755
	 Val. Loss: 13.855 |  Val. PPL: 1040430.840
Epoch: 159 | Time: 0m 26s
	Train Loss: 0.856 | Train PPL:   2.353
	 Val. Loss: 13.925 |  Val. PPL: 1115376.578
Epoch: 160 | Time: 0m 26s
	Train Loss: 0.762 | Train PPL:   2.142
	 Val. Loss: 13.871 |  Val. PPL: 1056807.179
Epoch: 161 | Time: 0m 26s
	Train Loss: 0.707 | Train PPL:   2.028
	 Val. Loss: 14.191 |  Val. PPL: 1455588.479
Epoch: 162 | Time: 0m 27s
	Train Loss: 0.719 | Train PPL:   2.051
	 Val. Loss: 14.107 |  Val. PPL: 1338009.029
Ep

KeyboardInterrupt: 

In [1508]:
print("Question: ", a[0])
print("Answer/Ground Truth: ",b[0])

query = [vocab.word_2_index[word] for word in a[0]]
query.insert(0, SOS_token) 
query.append(EOS_token)

response = [0 for word in a[0]]
response.insert(0, SOS_token) 
response.append(EOS_token)

q_temp = np.array([query]).T
src = torch.from_numpy(q_temp)
r_temp = np.array([response]).T
trg = torch.from_numpy(r_temp)

model.eval()
with torch.no_grad():
    outputs = model(src, trg, teacher_forcing_ratio=0)

outputs.shape

output_idx = outputs[1:].squeeze(1).argmax(1)
l = output_idx.numpy().tolist()
x = ' '.join([trg_vocab.index_2_word[idx] for idx in l])
print(x)

Question:  ['when', 'did', 'beyonce', 'start', 'becoming', 'popular']
Answer/Ground Truth:  ['in', 'the', 'late', '1990s']
march EOS EOS EOS EOS EOS EOS


In [1517]:
print("Question: ", a[100])
print("Answer/Ground Truth: ",b[100])

query = [vocab.word_2_index[word] for word in a[100]]
query.insert(0, SOS_token) 
query.append(EOS_token)

response = [0 for word in a[100]]
response.insert(0, SOS_token) 
response.append(EOS_token)

q_temp = np.array([query]).T
src = torch.from_numpy(q_temp)
r_temp = np.array([response]).T
trg = torch.from_numpy(r_temp)

model.eval()
with torch.no_grad():
    outputs = model(src, trg, teacher_forcing_ratio=0)

outputs.shape

output_idx = outputs[1:].squeeze(1).argmax(1)
l = output_idx.numpy().tolist()
x = ' '.join([trg_vocab.index_2_word[idx] for idx in l])
print(x)

Question:  ['where', 'did', 'beyonce', 'perform', 'in']
Answer/Ground Truth:  ['glastonbury', 'festival']
y adult contemporary EOS EOS EOS


In [1518]:
print("Question: ", a[101])
print("Answer/Ground Truth: ",b[101])

query = [vocab.word_2_index[word] for word in a[101]]
query.insert(0, SOS_token) 
query.append(EOS_token)

response = [0 for word in a[101]]
response.insert(0, SOS_token) 
response.append(EOS_token)

q_temp = np.array([query]).T
src = torch.from_numpy(q_temp)
r_temp = np.array([response]).T
trg = torch.from_numpy(r_temp)

model.eval()
with torch.no_grad():
    outputs = model(src, trg, teacher_forcing_ratio=0)

outputs.shape

output_idx = outputs[1:].squeeze(1).argmax(1)
l = output_idx.numpy().tolist()
x = ' '.join([trg_vocab.index_2_word[idx] for idx in l])
print(x)

Question:  ['who', 'did', 'beyoncé', 'perform', 'privately', 'for', 'in']
Answer/Ground Truth:  ['muammar', 'gaddafi']
new ivy campaign EOS EOS EOS EOS EOS
