In [1]:
!pip install torch torchtext==0.3.1



In [2]:
# References: https://medium.com/@adam.wearne/seq2seq-with-pytorch-46dc00ff5164

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random

class Encoder(nn.Module):

    def __init__(self, hidden_size, embedding_size,
                 embedding, answer_embedding, lexical_embedding, n_layers, dropout):

        super(Encoder, self).__init__()

        # Initialize network parameters
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Embedding layer to be shared with Decoder
        self.embedding = embedding
        self.answer_embedding = answer_embedding
        self.lexical_embedding = lexical_embedding

        # Bidirectional GRU
        self.gru = nn.GRU(embedding_size, hidden_size,
                          num_layers=n_layers,
                          dropout=dropout,
                          bidirectional=True)

    def forward(self, input_sequence, input_lengths, answer_sequence, lexical_sequence):

        # Convert input_sequence to word embeddings
        word_embeddings = self.embedding(input_sequence)
        answer_embeddings = self.answer_embedding(answer_sequence)
        lexical_embeddings = self.lexical_embedding(lexical_sequence)

        # Concatenate word embeddings from all features
        final_embeddings = torch.cat((word_embeddings,answer_embeddings,lexical_embeddings), 0)

        # Pack the sequence of embeddings
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(final_embeddings, input_lengths)

        # Run the packed embeddings through the GRU, and then unpack the sequences
        outputs, hidden = self.gru(packed_embeddings)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)

        # The ouput of a GRU has shape (seq_len, batch, hidden_size * num_directions)
        # Because the Encoder is bidirectional, combine the results from the
        # forward and reversed sequence by simply adding them together.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]

        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        self.hidden_size = hidden_size

    def dot_score(self, hidden_state, encoder_states):
        # Attention model use the dot product formula as global attention
        return torch.sum(hidden_state * encoder_states, dim=2)

    def forward(self, hidden, encoder_outputs, mask):
        attn_scores = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_scores = attn_scores.t()

        # Apply mask so network does not attend <pad> tokens
        attn_scores = attn_scores.masked_fill(mask == 0, -1e10)

        # Return softmax over attention scores
        return F.softmax(attn_scores, dim=1).unsqueeze(1)

class Decoder(nn.Module):
    def __init__(self, embedding, embedding_size,
                 hidden_size, output_size, n_layers, dropout):

        super(Decoder, self).__init__()

        # Initialize network params
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding

        self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                          dropout=dropout)

        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attention(hidden_size)

    def forward(self, current_token, hidden_state, encoder_outputs, mask):

        # convert current_token to word_embedding
        embedded = self.embedding(current_token)

        # Pass through GRU
        rnn_output, hidden_state = self.gru(embedded, hidden_state)

        # Calculate attention weights
        attention_weights = self.attn(rnn_output, encoder_outputs, mask)

        # Calculate context vector
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1))

        # Concatenate  context vector and GRU output
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Pass concat_output to final output layer
        output = self.out(concat_output)

        # Return output and final hidden state
        return output, hidden_state

class Seq2seq(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size,
                 device, pad_idx, eos_idx, sos_idx, teacher_forcing_ratio=0.5):
        super(Seq2seq, self).__init__()

        # Initialize embedding layer shared by encoder and decoder
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.answer_embedding = nn.Embedding(6, embedding_size, padding_idx=1)
        # Size could sometime change, depend on the device that the model is trained on
        self.lexical_embedding = nn.Embedding(452, embedding_size, padding_idx=1)

        # Encoder network
        self.encoder = Encoder(hidden_size,
                               embedding_size,
                               self.embedding,
                               self.answer_embedding,
                               self.lexical_embedding,
                               n_layers=2,
                               dropout=0.5)

        # Decoder network
        self.decoder = Decoder(self.embedding,
                               embedding_size,
                               hidden_size,
                               vocab_size,
                               n_layers=2,
                               dropout=0.5)


        # Indices of special tokens and hardware device
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.sos_idx = sos_idx
        self.device = device

    def create_mask(self, input_sequence):

        return (input_sequence != self.pad_idx).permute(1, 0)

    def forward(self, input_sequence, answer_sequence, lexical_sequence, output_sequence, teacher_forcing_ratio):

        # Unpack input_sequence tuple
        input_tokens = input_sequence[0]
        input_lengths = input_sequence[1]

        # Unpack output_tokens, or create an empty tensor for text generation
        if output_sequence is None:
            inference = True
            output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
        else:
            inference = False
            output_tokens = output_sequence[0]

        vocab_size = self.decoder.output_size

        batch_size = len(input_lengths)
        max_seq_len = len(output_tokens)

        # Tensor initialization to store Decoder output
        outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)

        # Pass through the first half of the network
        encoder_outputs, hidden = self.encoder(input_tokens, input_lengths, answer_sequence, lexical_sequence)

        # Ensure dim of hidden_state can be fed into Decoder
        hidden =  hidden[:self.decoder.n_layers]

        # First input to the decoder is the <sos> tokens
        output = output_tokens[0,:]

        # Create mask
        mask = self.create_mask(input_tokens)

        # Step through the length of the output sequence one token at a time
        # Teacher forcing is used to assist training
        for t in range(1, max_seq_len):
            output = output.unsqueeze(0)

            output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (output_tokens[t] if teacher_force else top1)

            # If we're in inference mode, keep generating until we produce an
            # <eos> token
            if inference and output.item() == self.eos_idx:
                return outputs[:t]

        return outputs

In [4]:
%cd ../

/notebooks/automatic-question-generation-master


In [5]:
import os
DIR = os.getcwd()

In [6]:
DIR

'/notebooks/automatic-question-generation-master'

In [7]:
train_set_path = os.path.join(DIR, 'results/resultssquad_train.csv')
dev_set_path = os.path.join(DIR, 'results/resultssquad_dev.csv')
test_size = 0.7
save = os.path.join(DIR, 'dataset')
train_set = os.path.join(DIR, 'dataset')
word_vector = 'glove'
batch_size = 128
numberbatch_loc = os.path.join(DIR, 'dataset')
resume = ''
epochs = 10

In [8]:
# Saving file names to variables
train_set_path = os.path.join(DIR, 'results/resultssquad_train.csv')
trainloc = train_set_path
# trainloc = os.path.join(DIR, 'dataset/sample_train.csv')
valloc = save+'/validation_set.csv'
testloc = save+'/test_set.csv'

In [15]:
# Code adapted from : https://github.com/bentrevett/pytorch-seq2seq
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from torchtext.vocab import Vectors

from tqdm import tqdm
import random
import pandas as pd
import numpy as np

import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
## Wordnet dependencies from meteor score
#nltk.download('wordnet')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
trainloc = os.path.join(DIR, 'results/resultssquad_train.csv')
valloc = os.path.join(DIR, 'dataset/validation_set.csv')
testloc = os.path.join(DIR, 'dataset/test_set.csv')
resume = os.path.join('models/model_14.pth')

# Create Field object
tokenize = lambda x: x.split()
TEXT = data.Field(tokenize=tokenize, lower=False, include_lengths = True, init_token = '<SOS>', eos_token = '<EOS>')
LEX = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')
BIO = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')

# Specify Fields in the dataset
fields = [('context', TEXT), ('question', TEXT), ('bio', BIO), ('lex', LEX)]

# Build the dataset
train_data, valid_data, test_data = data.TabularDataset.splits(path = '',train=trainloc, validation=valloc,
                                 test=testloc, fields = fields, format='csv', skip_header=True)

# Build vocabulary
MAX_VOCAB_SIZE = 35000
MIN_COUNT = 5
BATCH_SIZE = 128

if 'glove' == 'glove':
  TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                 min_freq=MIN_COUNT, vectors='glove.6B.300d',
                 unk_init=torch.Tensor.normal_)
else:
  cache_ = os.path.join(DIR, 'dataset')
  vectors = Vectors(name='numberbatch-en-19.08.txt', cache=cache_)
  TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                 min_freq=MIN_COUNT, vectors=vectors,
                 unk_init=torch.Tensor.normal_)

BIO.build_vocab(train_data)
LEX.build_vocab(train_data)

# Building model
pad_idx = TEXT.vocab.stoi['<pad>']
eos_idx = TEXT.vocab.stoi['<EOS>']
sos_idx = TEXT.vocab.stoi['<SOS>']

In [16]:
# Size of embedding_dim should match the dim of pre-trained word embeddings
embedding_dim = 300
hidden_dim = 512
vocab_size = len(TEXT.vocab)

# Initializing weights
model = Seq2seq(embedding_dim, hidden_dim, vocab_size, device, pad_idx, eos_idx, sos_idx).to(device)
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Initializing weights for special tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

model.embedding.weight.requires_grad = False

optimizer = optim.Adam([param for param in model.parameters() if param.requires_grad == True],
                       lr=1.0e-3)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)




In [17]:
# Load model
model.load_state_dict(torch.load(resume))


<All keys matched successfully>

In [None]:

def predict_question(model, paragraph, answer_pos, lex_features):
    model.eval()

    tokenized = ['<SOS>'] + paragraph + ['<EOS>']
    numericalized = [TEXT.vocab.stoi[t] for t in tokenized]

    tokenized_answer = ['<SOS>'] + answer_pos + ['<EOS>']
    numericalized_answer = [BIO.vocab.stoi[t] for t in tokenized_answer]

    tokenized_lex = ['<SOS>'] + lex_features + ['<EOS>']
    numericalized_lex = [LEX.vocab.stoi[t] for t in tokenized_lex]

    paragraph_length = torch.LongTensor([len(numericalized)]).to(model.device)
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(model.device)

    answer_tensor = torch.LongTensor(numericalized_answer).unsqueeze(1).to(model.device)
    lex_tensor = torch.LongTensor(numericalized_lex).unsqueeze(1).to(model.device)

    question_tensor_logits = model((tensor, paragraph_length), answer_tensor, lex_tensor, None, 0)

    question_tensor = torch.argmax(question_tensor_logits.squeeze(1), 1)
    question = [TEXT.vocab.itos[t] for t in question_tensor]

    # Start at the first index.  We don't need to return the <SOS> token
    question = question[1:]

    return question, question_tensor_logits

# Display prediction
num = 100
example_idx = random.sample(range(1,300),num)

for i in example_idx:
  src = vars(train_data.examples[i])['context']
  trg = vars(train_data.examples[i])['question']
  ans = vars(train_data.examples[i])['bio']
  lex = vars(train_data.examples[i])['lex']

  print('context: ', ' '.join(src))
  print('question: ', ' '.join(trg))

  question, logits = predict_question(model, src, ans, lex)
  print('predicted: ', " ".join(question))
  print()

for j in example_idx:
  src = vars(test_data.examples[j])['context']
  trg = vars(test_data.examples[j])['question']
  ans = vars(test_data.examples[j])['bio']
  lex = vars(test_data.examples[j])['lex']

  print('context: ', ' '.join(src))
  print('question: ', ' '.join(trg))

  question, logits = predict_question(model, src, ans, lex)
  print('predicted: ', " ".join(question))
  print()


In [None]:
def calculate_bleu_and_meteor(data, model):

    trgs = []
    pred_trgs = []
    meteor_score_ = []

    for datum in data:

        src = vars(datum)['context']
        trg = vars(datum)['question']
        ans = vars(datum)['bio']
        lex = vars(datum)['lex']

        pred_trg, _ = predict_question(model, src, ans, lex)

        #cut off <EOS> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        # print(pred_trg)
        trgs.append(trg)
        # print(trg)
        meteor_score_.append(single_meteor_score(' '.join(pred_trg),' '.join(trg)))

    bleu_score = corpus_bleu(pred_trgs, trgs)
    meteor_score_ = np.mean(meteor_score_)

    return bleu_score,meteor_score_

bleu_score, meteor_score_ = calculate_bleu_and_meteor(test_data, model)

print('BLEU score = {:.2f}'.format(bleu_score*100))
print('METEOR score = {:.2f}'.format(meteor_score_*100))

In [21]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [22]:
def calculate_bleu_and_meteor(data, model):
    trgs = []
    pred_trgs = []
    meteor_score_ = []

    for datum in data:
        src = vars(datum)['context']
        trg = vars(datum)['question']
        ans = vars(datum)['bio']
        lex = vars(datum)['lex']

        pred_trg, _ = predict_question(model, src, ans, lex)

        # Cut off <EOS> token
        pred_trg = pred_trg[:-1]

        # Convert lists to strings if needed
        trg = ' '.join(trg)
        pred_trg = ' '.join(pred_trg)

        pred_trgs.append(pred_trg)
        trgs.append(trg)

        meteor_score_.append(single_meteor_score(pred_trg.split(), trg.split()))

    bleu_score = corpus_bleu([[trg.split()] for trg in trgs], [pred_trg.split() for pred_trg in pred_trgs])
    meteor_score_ = np.mean(meteor_score_)

    return bleu_score, meteor_score_

# Calculate BLEU and METEOR scores
bleu_score, meteor_score_ = calculate_bleu_and_meteor(test_data, model)

print('BLEU score = {:.2f}'.format(bleu_score * 100))
print('METEOR score = {:.2f}'.format(meteor_score_ * 100))


BLEU score = 1.46
METEOR score = 16.17
