In [1]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [8]:
import spacy
nlp = spacy.load("en_core_web_md")

In [9]:
def extract_features(text, answer, answer_start, nlp):
    '''
    Extract answers to obtain POS, NER, case, BIO features based on text

    Arguments:
        text	-- context or paragraph
        answer 	-- answer in paragraph's question
        answer_start -- starting index of answer
        nlp 	-- spacy tool for nlp
    Returns:
        pos 	-- sequence of string of answer tokens part-of-speech tagging
		ner 	-- sequence of string of answer tokens named entity recognition
		case	-- sequence of string of answer tokens case
		bio 	-- sequence of string of answer tokens inside-outside-beggining tagging
		tokenized 	-- joined tokenized context (paragraph) with lower typecasting
    '''
    
    # Extract answer location index (left, right and answers itself) in text
    left = text[0:answer_start]
    ans = text[answer_start:answer_start+len(answer)+1]
    right = text[answer_start+len(answer)+1:len(text)+1]    
    
    # Initialize return values list
    pos = []
    ner = []
    case = []
    bio = []
    tokenized = []
    
    left_side = nlp(left)
    answer_range = nlp(ans)
    right_side = nlp(right)
    
    for token in left_side:
        if token.text != '' and not token.text.isspace():
            tokenized.append(token.text.lower())
            pos.append(token.pos_)

            if token.ent_type_ == '':
                ner.append('O')
            else:
                ner.append(token.ent_type_)

            if token.text[0].isupper():
                case.append('UP')
            else:
                case.append('LOW')

            bio.append('O')
    
    for token in answer_range:
        if token.text != '' and not token.text.isspace():
            tokenized.append(token.text.lower())
            pos.append(token.pos_)

            if token.ent_type_ == '':
                ner.append('O')
            else:
                ner.append(token.ent_type_)

            if token.text[0].isupper():
                case.append('UP')
            else:
                case.append('LOW')

            if token.i == 0:
                bio.append('B')
            else:
                bio.append('I')
    
    for token in right_side:
        if token.text != '' and not token.text.isspace():
            tokenized.append(token.text.lower())
            pos.append(token.pos_)

            if token.ent_type_ == '':
                ner.append('O')
            else:
                ner.append(token.ent_type_)

            if token.text[0].isupper():
                case.append('UP')
            else:
                case.append('LOW')

            bio.append('O')
                
    return (' '.join(pos)), (' '.join(ner)), (' '.join(case)), (' '.join(bio)), (' '.join(tokenized))

In [10]:
def build_lexical_features(data):
    '''
    Creating pandas dataframe of features from parsed data

    Arguments:
        data -- data to be extracted; data must have context, answer, answer_start and question column
    Returns:
        data -- pandas dataframe of questions, context and features: IOB tag and lexical features(POS tag, NER, and case). 
    '''
    data['BIO'] = ''
    data['LEX'] = ''
    count = 0
    for idx, text, answer, answer_start, question in data[['context', 'answer', 'answer_start','question']].itertuples():
        print(text)
        pos, ner, case, data['BIO'][idx], data['context'][idx] = extract_features(text, str(answer), int(answer_start), nlp)
        lex = [i + '_' + j + '_' + k for i, j, k in zip(pos.split(), ner.split(), case.split())]
        data['LEX'][idx] = ' '.join(lex)
        data['question'][idx] = ' '.join([token.text.lower() for token in nlp(question)])
        count+=1
        print(count)

    # Building data on selected columns
    data = data[['context', 'question', 'BIO', 'LEX']]

    return data

In [11]:


# def build_lexical_features(data):
#     '''
#     Creating pandas dataframe of features from parsed data

#     Arguments:
#         data -- data to be extracted; data must have context, answer, answer_start, and question columns
#         nlp -- spaCy language model
#     Returns:
#         data -- pandas dataframe of questions, context, and features: IOB tag and lexical features (POS tag, NER, and case). 
#     '''
#     bio_tags = []
#     lex_features = []

#     for text, answer, answer_start, question in zip(data['context'], data['answer'], data['answer_start'], data['question']):
#         pos, ner, case, bio_tag, context = extract_features(text, str(answer), int(answer_start), nlp)
        
#         lex = [f"{i}_{j}_{k}" for i, j, k in zip(pos.split(), ner.split(), case.split())]
        
#         bio_tags.append(bio_tag)
#         lex_features.append(' '.join(lex))

#     data['BIO'] = bio_tags
#     data['LEX'] = lex_features

#     # Processing the 'question' column
#     data['question'] = data['question'].apply(lambda q: ' '.join([token.text.lower() for token in nlp(q)]))

#     # Building data on selected columns
#     data = data[['context', 'question', 'BIO', 'LEX']]

#     return data


In [12]:
import pandas as pd

context = 'Hugging Face provides a platform called the Model Hub where you can upload and share models, including PyTorch models. Here is a general outline of the steps to upload a PyTorch model to the Hugging Face Model Hub'
answer = 'Hugging Face Model Hub'
answer_start = 0
question = 'What is the name of the platform provided by Hugging Face?'
question_end = len(question) - 1

df = pd.DataFrame({'context':[context], 'question':[question], 'answer':[answer], 'answer_start':[answer_start]})

In [13]:
data = build_lexical_features(df)
data

Hugging Face provides a platform called the Model Hub where you can upload and share models, including PyTorch models. Here is a general outline of the steps to upload a PyTorch model to the Hugging Face Model Hub
1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos, ner, case, data['BIO'][idx], data['context'][idx] = extract_features(text, str(answer), int(answer_start), nlp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos, ner, case, data['BIO'][idx], data['context'][idx] = extract_features(text, str(answer), int(answer_start), nlp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['LEX'][idx] = ' '.join(lex)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http

Unnamed: 0,context,question,BIO,LEX
0,hugging face provides a platform called the mo...,what is the name of the platform provided by h...,B I I I O O O O O O O O O O O O O O O O O O O ...,VERB_O_UP PROPN_O_UP VERB_O_LOW PRON_O_LOW NOU...


In [14]:
# References: https://medium.com/@adam.wearne/seq2seq-with-pytorch-46dc00ff5164

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random

class Encoder(nn.Module):

    def __init__(self, hidden_size, embedding_size,
                 embedding, answer_embedding, lexical_embedding, n_layers, dropout):

        super(Encoder, self).__init__()

        # Initialize network parameters
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Embedding layer to be shared with Decoder
        self.embedding = embedding
        self.answer_embedding = answer_embedding
        self.lexical_embedding = lexical_embedding

        # Bidirectional GRU
        self.gru = nn.GRU(embedding_size, hidden_size,
                          num_layers=n_layers,
                          dropout=dropout,
                          bidirectional=True)

    def forward(self, input_sequence, input_lengths, answer_sequence, lexical_sequence):

        # Convert input_sequence to word embeddings
        word_embeddings = self.embedding(input_sequence)
        answer_embeddings = self.answer_embedding(answer_sequence)
        lexical_embeddings = self.lexical_embedding(lexical_sequence)

        # Concatenate word embeddings from all features
        final_embeddings = torch.cat((word_embeddings,answer_embeddings,lexical_embeddings), 0)

        # Pack the sequence of embeddings
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(final_embeddings, input_lengths)

        # Run the packed embeddings through the GRU, and then unpack the sequences
        outputs, hidden = self.gru(packed_embeddings)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)

        # The ouput of a GRU has shape (seq_len, batch, hidden_size * num_directions)
        # Because the Encoder is bidirectional, combine the results from the
        # forward and reversed sequence by simply adding them together.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]

        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        self.hidden_size = hidden_size

    def dot_score(self, hidden_state, encoder_states):
        # Attention model use the dot product formula as global attention
        return torch.sum(hidden_state * encoder_states, dim=2)

    def forward(self, hidden, encoder_outputs, mask):
        attn_scores = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_scores = attn_scores.t()

        # Apply mask so network does not attend <pad> tokens
        attn_scores = attn_scores.masked_fill(mask == 0, -1e10)

        # Return softmax over attention scores
        return F.softmax(attn_scores, dim=1).unsqueeze(1)

class Decoder(nn.Module):
    def __init__(self, embedding, embedding_size,
                 hidden_size, output_size, n_layers, dropout):

        super(Decoder, self).__init__()

        # Initialize network params
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding

        self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                          dropout=dropout)

        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attention(hidden_size)

    def forward(self, current_token, hidden_state, encoder_outputs, mask):

        # convert current_token to word_embedding
        embedded = self.embedding(current_token)

        # Pass through GRU
        rnn_output, hidden_state = self.gru(embedded, hidden_state)

        # Calculate attention weights
        attention_weights = self.attn(rnn_output, encoder_outputs, mask)

        # Calculate context vector
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1))

        # Concatenate  context vector and GRU output
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Pass concat_output to final output layer
        output = self.out(concat_output)

        # Return output and final hidden state
        return output, hidden_state

class Seq2seq(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size,
                 device, pad_idx, eos_idx, sos_idx, teacher_forcing_ratio=0.5):
        super(Seq2seq, self).__init__()

        # Initialize embedding layer shared by encoder and decoder
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.answer_embedding = nn.Embedding(6, embedding_size, padding_idx=1)
        # Size could sometime change, depend on the device that the model is trained on
        self.lexical_embedding = nn.Embedding(452, embedding_size, padding_idx=1)

        # Encoder network
        self.encoder = Encoder(hidden_size,
                               embedding_size,
                               self.embedding,
                               self.answer_embedding,
                               self.lexical_embedding,
                               n_layers=2,
                               dropout=0.5)

        # Decoder network
        self.decoder = Decoder(self.embedding,
                               embedding_size,
                               hidden_size,
                               vocab_size,
                               n_layers=2,
                               dropout=0.5)


        # Indices of special tokens and hardware device
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.sos_idx = sos_idx
        self.device = device

    def create_mask(self, input_sequence):

        return (input_sequence != self.pad_idx).permute(1, 0)

    def forward(self, input_sequence, answer_sequence, lexical_sequence, output_sequence, teacher_forcing_ratio):

        # Unpack input_sequence tuple
        input_tokens = input_sequence[0]
        input_lengths = input_sequence[1]

        # Unpack output_tokens, or create an empty tensor for text generation
        if output_sequence is None:
            inference = True
            output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
        else:
            inference = False
            output_tokens = output_sequence[0]

        vocab_size = self.decoder.output_size

        batch_size = len(input_lengths)
        max_seq_len = len(output_tokens)

        # Tensor initialization to store Decoder output
        outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)

        # Pass through the first half of the network
        encoder_outputs, hidden = self.encoder(input_tokens, input_lengths, answer_sequence, lexical_sequence)

        # Ensure dim of hidden_state can be fed into Decoder
        hidden =  hidden[:self.decoder.n_layers]

        # First input to the decoder is the <sos> tokens
        output = output_tokens[0,:]

        # Create mask
        mask = self.create_mask(input_tokens)

        # Step through the length of the output sequence one token at a time
        # Teacher forcing is used to assist training
        for t in range(1, max_seq_len):
            output = output.unsqueeze(0)

            output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (output_tokens[t] if teacher_force else top1)

            # If we're in inference mode, keep generating until we produce an
            # <eos> token
            if inference and output.item() == self.eos_idx:
                return outputs[:t]

        return outputs

In [15]:
data

Unnamed: 0,context,question,BIO,LEX
0,hugging face provides a platform called the mo...,what is the name of the platform provided by h...,B I I I O O O O O O O O O O O O O O O O O O O ...,VERB_O_UP PROPN_O_UP VERB_O_LOW PRON_O_LOW NOU...


In [16]:
csv_path = 'test.csv'
data.to_csv(csv_path, index=False)

In [11]:
# build_vocab  data
import torchtext
from torchtext.data import Field, TabularDataset
import spacy
import numpy as np
import pandas as pd
import random
import math
import time
import torch

# Set random seeds for reproducibility
SEED = 1234

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
random.seed(SEED)
np.random.seed(SEED)

# Load spacy models
spacy_en = spacy.load('en_core_web_md')

# Define fields
tokenize = lambda x: x.split()
TEXT = Field(tokenize=tokenize, lower=False, include_lengths = True, init_token = '<SOS>', eos_token = '<EOS>')
LEX = Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')
BIO = Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')


# Load data
fields = [('context', TEXT), ('question', TEXT), ('BIO', BIO), ('LEX', LEX)]

Device: cpu


In [12]:
test_dataset = TabularDataset(path=csv_path, format='csv', fields=fields, skip_header=True)

In [18]:
import os
DIR = os.getcwd()

In [38]:
# Code adapted from : https://github.com/bentrevett/pytorch-seq2seq
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from torchtext.vocab import Vectors

from tqdm import tqdm
import random
import pandas as pd
import numpy as np

import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
## Wordnet dependencies from meteor score
#nltk.download('wordnet')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
trainloc = os.path.join(DIR, 'results/resultssquad_train.csv')
valloc = os.path.join(DIR, 'dataset/validation_set.csv')
testloc = os.path.join(DIR, 'dataset/test_set.csv')
resume = os.path.join('models/model_14.pth')

# Create Field object
tokenize = lambda x: x.split()
TEXT = data.Field(tokenize=tokenize, lower=False, include_lengths = True, init_token = '<SOS>', eos_token = '<EOS>')
LEX = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')
BIO = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<SOS>')

# Specify Fields in the dataset
fields = [('context', TEXT), ('question', TEXT), ('bio', BIO), ('lex', LEX)]

# Build the dataset
train_data, valid_data, test_data = data.TabularDataset.splits(path = '',train=trainloc, validation=valloc,
                                 test=testloc, fields = fields, format='csv', skip_header=True)

# Build vocabulary
MAX_VOCAB_SIZE = 35000
MIN_COUNT = 5

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                min_freq=MIN_COUNT, vectors='glove.6B.300d',
                unk_init=torch.Tensor.normal_)
BIO.build_vocab(train_data)
LEX.build_vocab(train_data)

# Building model
pad_idx = TEXT.vocab.stoi['<pad>']
eos_idx = TEXT.vocab.stoi['<EOS>']
sos_idx = TEXT.vocab.stoi['<SOS>']

In [39]:
vocab_dir = os.path.join(DIR, 'vocabs')

In [44]:
# Save the vocabularies
torch.save(TEXT.vocab, os.path.join(vocab_dir, 'text_vocab.pth'))
torch.save(LEX.vocab, os.path.join(vocab_dir, 'lex_vocab.pth'))
torch.save(BIO.vocab, os.path.join(vocab_dir, 'bio_vocab.pth'))

In [45]:
# Load the vocabularies
text_vocab = torch.load(os.path.join(vocab_dir, 'text_vocab.pth'))
bio_vocab = torch.load(os.path.join(vocab_dir, 'bio_vocab.pth'))
lex_vocab = torch.load(os.path.join(vocab_dir, 'lex_vocab.pth'))

# Assign the loaded vocabularies to your fields
TEXT.vocab = text_vocab
BIO.vocab = bio_vocab
LEX.vocab = lex_vocab


In [46]:
print(len(TEXT.vocab), pad_idx, eos_idx, sos_idx)

35004 1 3 2


In [15]:
# Size of embedding_dim should match the dim of pre-trained word embeddings
embedding_dim = 300
hidden_dim = 512
vocab_size = len(TEXT.vocab)

# Initializing weights
model = Seq2seq(embedding_dim, hidden_dim, vocab_size, device, pad_idx, eos_idx, sos_idx).to(device)


In [29]:
# Size of embedding_dim should match the dim of pre-trained word embeddings
embedding_dim = 300
hidden_dim = 512
vocab_size = 35004

pad_idx = 1
eos_idx = 3
sos_idx = 2

# Initializing weights
model = Seq2seq(embedding_dim, hidden_dim, vocab_size, device, pad_idx, eos_idx, sos_idx).to(device)


In [30]:
# Load model
model.load_state_dict(torch.load(resume, map_location=torch.device('cpu')))

<All keys matched successfully>

In [31]:
test_dataset = data.TabularDataset(path=csv_path, format='csv', fields=fields, skip_header=True)

In [32]:

def predict_question(model, paragraph, answer_pos, lex_features):
    model.eval()

    tokenized = ['<SOS>'] + paragraph + ['<EOS>']
    numericalized = [TEXT.vocab.stoi[t] for t in tokenized]

    tokenized_answer = ['<SOS>'] + answer_pos + ['<EOS>']
    numericalized_answer = [BIO.vocab.stoi[t] for t in tokenized_answer]

    tokenized_lex = ['<SOS>'] + lex_features + ['<EOS>']
    numericalized_lex = [LEX.vocab.stoi[t] for t in tokenized_lex]

    paragraph_length = torch.LongTensor([len(numericalized)]).to(model.device)
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(model.device)

    answer_tensor = torch.LongTensor(numericalized_answer).unsqueeze(1).to(model.device)
    lex_tensor = torch.LongTensor(numericalized_lex).unsqueeze(1).to(model.device)

    question_tensor_logits = model((tensor, paragraph_length), answer_tensor, lex_tensor, None, 0)

    question_tensor = torch.argmax(question_tensor_logits.squeeze(1), 1)
    question = [TEXT.vocab.itos[t] for t in question_tensor]

    # Start at the first index.  We don't need to return the <SOS> token
    question = question[1:]

    return question, question_tensor_logits

# Display prediction
for example in test_dataset.examples:
   src = vars(example)['context']
   trg = vars(example)['question']
   ans = vars(example)['bio']
   lex = vars(example)['lex']

   print('context: ', ' '.join(src))
   print('question: ', ' '.join(trg))
   question, logits = predict_question(model, src, ans, lex)
   print('predicted: ', " ".join(question))
   print()

context:  hugging face provides a platform called the model hub where you can upload and share models , including pytorch models . here is a general outline of the steps to upload a pytorch model to the hugging face model hub
question:  what is the name of the platform provided by hugging face ?
predicted:  what is a used to find a computer ?

