In [None]:
#https://rsilveira79.github.io/fermenting_gradients/machine_learning/nlp/pytorch/pytorch-transformer-squad/
#download model from https://drive.google.com/drive/folders/1OnvT5sKgi0WVWTXnTaaOPTE5KIh-xg_E
#download model from https://drive.google.com/drive/folders/1e7wu9yI-rGkSzjoPU2TpCC9FMvlKvl8R

In [1]:
import os
import time
import torch
from pytorch_transformers import BertConfig, BertTokenizer, BertForQuestionAnswering
from pytorch_transformers import XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer

In [2]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)
}

In [3]:
class QuestionAnswering(object):
    def __init__(self, config_file, weight_file, tokenizer_file, model_type ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.config_class, self.model_class, self.tokenizer_class = MODEL_CLASSES[model_type]
        self.config = self.config_class.from_json_file(config_file)
        self.model = self.model_class(self.config)
        self.model.load_state_dict(torch.load(weight_file, map_location=self.device))
        self.tokenizer = self.tokenizer_class(tokenizer_file)
        self.model_type = model_type
    
    def to_list(self, tensor):
        return tensor.detach().cpu().tolist()

    def get_reply(self, question, passage):
        self.model.eval()
        with torch.no_grad():
            input_ids, _ , tokens = self.prepare_features(question, passage)
            if self.model_type == 'bert':
                span_start,span_end= self.model(input_ids)
                answer = tokens[torch.argmax(span_start):torch.argmax(span_end)+1]
                answer = self.bert_convert_tokens_to_string(answer)
            elif self.model_type == 'xlnet':
                input_vector = {'input_ids': input_ids,
                                'start_positions': None,
                                'end_positions': None }
                outputs = self.model(**input_vector)
                answer = tokens[self.to_list(outputs[1])[0][torch.argmax(outputs[0])]:self.to_list(outputs[3])[0][torch.argmax(outputs[2])]+1]
                answer = self.xlnet_convert_tokens_to_string(answer)
        return answer
    
    def bert_convert_tokens_to_string(self, tokens):
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        if '@' in tokens:
            out_string = out_string.replace(' ', '')
        return out_string

    def xlnet_convert_tokens_to_string(self, tokens):
        out_string = ''.join(tokens).replace('▁', ' ').strip()
        return out_string

    def prepare_features(self, question,  passage, max_seq_length = 300, 
                 zero_pad = False, include_CLS_token = True, include_SEP_token = True):
        ## Tokenzine Input
        tokens_a = self.tokenizer.tokenize(question)
        tokens_b = self.tokenizer.tokenize(passage)
        ## Truncate
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
        ## Initialize Tokens
        tokens = []
        if include_CLS_token:
            tokens.append(self.tokenizer.cls_token)
        ## Add Tokens and separators
        for token in tokens_a:
            tokens.append(token)
        if include_SEP_token:
            tokens.append(self.tokenizer.sep_token)
        for token in tokens_b:
            tokens.append(token)
        ## Convert Tokens to IDs
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ## Input Mask 
        input_mask = [1] * len(input_ids)
        ## Zero-pad sequence lenght
        if zero_pad:
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
        return torch.tensor(input_ids).unsqueeze(0), input_mask, tokens

In [4]:
bert = QuestionAnswering(
    config_file =   'bert-large-cased-whole-word-masking-finetuned-squad-config.json',
    weight_file=    'bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin',
    tokenizer_file= 'bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt',
    model_type =    'bert'
)

<h3> Feed your Model With The Passage </h3>

In [5]:
facts = '''Insect behavior generally appears to be explicable in terms of unconscious, inflexible stimulus-response
mechanisms. For instance, a female sphex wasp leaves her egg sealed in a burrow alongside a paralyzed
grasshopper, which her larvae can eat upon hatching. Before she deposits the grasshopper in the burrow, she
inspects the burrow; if the inspection reveals no problems, she drags the grasshopper inside by its antennae. As
thoughtful as this behavior appears, it reveals its mechanistic character upon interference. Darwin discovered
that prior removal of the grasshopper's antennae prevents the wasp from depositing the grasshopper, even though
the legs or ovipositor could also serve as handles. Likewise, Fabre moved the grasshopper a few centimeters
away from the burrow's mouth while the wasp was inside inspecting. The wasp returned the grasshopper to the
edge of the burrow and then began a new inspection. Fabre performed this disruptive maneuver forty times; the
wasp's response never changed.
'''

<h3> Ask your Questoins in Here </h3>

In [8]:
questions = [
    "The author mentions the work of Darwin and Fabre in order to?",
    "Which of the following hypothetical variations in the experiments described in the passage would most weaken the primary claim of the passage?"
]

<h3> Get your answers </h3>

In [9]:
for question in questions:
    print('Question:',question)
    print('Answer:  ',bert.get_reply(question,facts))

Question: The author mentions the work of Darwin and Fabre in order to?
Answer:   [CLS]
Question: Which of the following hypothetical variations in the experiments described in the passage would most weaken the primary claim of the passage?
Answer:   [CLS]
