In [1]:
import json

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open("train-v2.0.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [2]:
# get the available questions and answers for a given topic
def get_qa(topic, data):
    """
    Collect the answerable questions (and their first listed answer) for the
    first article in `data` whose title equals `topic`.

    Args:
      topic (`str`): Article title to look up (exact match on `d['title']`).
      data (`dict`): Parsed dataset with a top-level 'data' list; each entry
        holds 'title' and 'paragraphs', each paragraph a list of 'qas'.

    Returns:
      A `(questions, answers)` tuple of two index-aligned lists of `str`.
      Both lists are empty when no article matches `topic`, so callers can
      always unpack the result.
    """
    q = []
    a = []
    for d in data['data']:
        if d['title'] != topic:
            continue
        for paragraph in d['paragraphs']:
            for qa in paragraph['qas']:
                # Tolerate records that carry no 'is_impossible' flag.
                if qa.get('is_impossible', False):
                    continue
                # An answerable record without any answer cannot be paired.
                if not qa['answers']:
                    continue
                q.append(qa['question'])
                a.append(qa['answers'][0]['text'])
        # Only the first matching article is used.
        return q, a

    # No article matched: return empty lists instead of an implicit None.
    return q, a

# Question/answer pairs for the topic used throughout the notebook.
questions, answers = get_qa('Premier_League', data)

n_questions = len(questions)
print("Number of available questions: {}".format(n_questions))

Number of available questions: 357


In [7]:
# %pip (instead of a "!" shell escape) installs into the running kernel's environment.
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [9]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; per the recorded output below, the token
# is validated and saved on disk.
# NOTE(review): no later cell visibly requires Hub authentication — confirm
# this step is needed before keeping it in the notebook.
notebook_login()

Token is valid.
Your token has been saved to /home/dev/.cache/huggingface/token
Login successful


In [10]:
# %pip (instead of a "!" shell escape) installs into the running kernel's environment.
%pip install transformers[torch]
%pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable




In [12]:
from transformers import AutoModel,AutoTokenizer

def get_model(model_name):
    """
    Load a model and its tokenizer through `AutoModel` / `AutoTokenizer`.

    Args:
      model_name (`str`): Identifier handed to both `from_pretrained` calls.

    Returns:
      A `(model, tokenizer)` tuple.
    """
    return (
        AutoModel.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )
  
# Model/tokenizer pair used by the stand-alone embedding cells below.
model, tokenizer = get_model("paraphrase-MiniLM-L6-v2")

In [13]:
import torch

# Mean Pooling - Take attention mask into account for correct averaging
# source: https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, counting only the
    positions where `attention_mask` is non-zero.

    Args:
      model_output: Indexable model output whose first element holds the
        token embeddings.
      attention_mask (`torch.Tensor`): Mask produced by the tokenizer; it is
        unsqueezed on the last axis and expanded to the embeddings' size.

    Returns:
      The masked mean over dimension 1 of the token embeddings.
    """
    token_embeddings = model_output[0]

    # Broadcast the mask over the embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    masked_sum = (token_embeddings * mask).sum(dim=1)
    # Clamp the token count so a fully masked row does not divide by zero.
    token_count = mask.sum(dim=1).clamp(min=1e-9)

    return masked_sum / token_count

def get_embeddings(questions, tokenizer, model):
    """
    Embed `questions` in a single forward pass and mean-pool the result.

    Args:
      questions (`list` of `str`): Sentences to embed.
      tokenizer: Callable tokenizer; invoked with padding and truncation
        enabled and PyTorch tensors requested.
      model: Callable model; invoked with the tokenizer output as keyword
        arguments.

    Returns:
      The mean-pooled embeddings returned by `mean_pooling`.
    """
    batch_inputs = tokenizer(
        questions,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Inference only: no gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**batch_inputs)

    return mean_pooling(outputs, batch_inputs['attention_mask'])

# Smoke test on the first three questions only.
sample_questions = questions[:3]
embeddings = get_embeddings(sample_questions, tokenizer, model)
print("Embeddings shape: {}".format(embeddings.shape))

Embeddings shape: torch.Size([3, 384])


In [14]:
new_question = 'Which days have the most events played at?'
new_embedding = get_embeddings([new_question], tokenizer, model)

# Squared Euclidean distance between each sample question and new_question;
# the single new embedding is broadcast against the sample embeddings.
squared_diff = (embeddings - new_embedding).pow(2)
squared_diff.sum(dim=1)

tensor([71.4030, 59.8726, 23.9430])

In [15]:
class QAEmbedder:
    """
    QA embedding model: given a set of questions, returns the corresponding
    mean-pooled embedding vectors.
    """

    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Creates the embedder and loads its model and tokenizer.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.model = None
        self.tokenizer = None
        self.model_name = model_name
        self.set_model(model_name)

    def get_model(self, model_name):
        """
        Loads a general tokenizer and model using 'AutoTokenizer' and
        'AutoModel'.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.

        Returns:
          A `(model, tokenizer)` tuple.
        """
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def set_model(self, model_name):
        """
        Loads the tokenizer and model for 'model_name' through
        'self.get_model' and makes them the active ones.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        # Load the model that was actually requested (not the previously
        # stored name), and only touch the instance once loading succeeded.
        model, tokenizer = self.get_model(model_name)
        self.model = model
        self.tokenizer = tokenizer
        self.model_name = model_name

    def _mean_pooling(self, model_output, attention_mask):
        """
        Internal method that averages the token embeddings of each sequence,
        counting only positions where the attention mask is non-zero.

        Args:
          model_output: output from the model; its first element holds the
            token embeddings
          attention_mask (`torch.Tensor`): attention mask produced by the
            tokenizer

        Returns:
          The averaged tensor (mean over dimension 1).
        """
        token_embeddings = model_output[0]

        # Broadcast the mask over the embedding dimension.
        input_mask_expanded = (
            attention_mask
            .unsqueeze(-1)
            .expand(token_embeddings.size())
            .float()
        )

        # Clamp the denominator so a fully masked row does not divide by zero.
        pool_emb = (
            torch.sum(token_embeddings * input_mask_expanded, 1)
            / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        )

        return pool_emb

    def get_embeddings(self, questions, batch=32):
        """
        Gets the corresponding embeddings for a set of input 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions
            to be embedded
          batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
          The embedding vectors, one row per question, concatenated in input
          order.

        Raises:
          ValueError: if 'batch' is smaller than 1.
        """
        if batch < 1:
            raise ValueError("'batch' must be a positive integer, got {}".format(batch))

        question_embeddings = []
        for i in range(0, len(questions), batch):

            # Tokenize sentences
            encoded_input = self.tokenizer(
                questions[i:i+batch],
                padding=True,
                truncation=True,
                return_tensors='pt',
            )

            # Compute token embeddings (inference only, no gradients)
            with torch.no_grad():
                model_output = self.model(**encoded_input)

            # Perform mean pooling
            batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
            question_embeddings.append(batch_embeddings)

        question_embeddings = torch.cat(question_embeddings, dim=0)
        return question_embeddings

In [16]:
class QASearcher:
    """
    QA search model: given a new question, it searches the most similar
    question in a stored 'context' and returns both that question and its
    associated answer.
    """

    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Creates the searcher with an empty context.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.answers = None
        self.questions = None
        self.question_embeddings = None
        self.embedder = QAEmbedder(model_name=model_name)

    def set_context_qa(self, questions, answers):
        """
        Sets the QA context to be used during search.

        Args:
          questions (`list` of `str`): List of strings defining the questions
            to be embedded
          answers (`list` of `str`): Best answer for each question in
            'questions' (same length, same order)

        Raises:
          ValueError: if 'questions' and 'answers' differ in length.
        """
        if len(questions) != len(answers):
            raise ValueError(
                "'questions' and 'answers' must have the same length "
                "({} != {})".format(len(questions), len(answers))
            )

        # Embed first: if this fails, the previous context stays intact
        # instead of being left half-updated.
        question_embeddings = self.get_q_embeddings(questions)

        self.answers = answers
        self.questions = questions
        self.question_embeddings = question_embeddings

    def get_q_embeddings(self, questions):
        """
        Gets the embeddings for the questions in 'context'.

        Args:
          questions (`list` of `str`): List of strings defining the questions
            to be embedded

        Returns:
          The L2-normalized embedding vectors, transposed so that they can be
          right-multiplied in 'cosine_similarity'.
        """
        question_embeddings = self.embedder.get_embeddings(questions)
        question_embeddings = torch.nn.functional.normalize(question_embeddings, p=2, dim=1)
        return question_embeddings.transpose(0, 1)

    def cosine_similarity(self, questions, batch=32):
        """
        Gets the cosine similarity between the new questions and the
        'context' questions.

        Args:
          questions (`list` of `str`): List of strings defining the questions
            to be embedded
          batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
          The cosine similarity matrix: one row per new question, one column
          per context question.

        Raises:
          RuntimeError: if no context has been set yet.
        """
        if self.question_embeddings is None:
            raise RuntimeError("Search context is not set; call 'set_context_qa' first")

        question_embeddings = self.embedder.get_embeddings(questions, batch=batch)
        question_embeddings = torch.nn.functional.normalize(question_embeddings, p=2, dim=1)

        # Both sides are unit-normalized, so the dot product is the cosine.
        cosine_sim = torch.mm(question_embeddings, self.question_embeddings)

        return cosine_sim

    def get_answers(self, questions, batch=32):
        """
        Gets the best answers in the stored 'context' for the given new
        'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions
            to be embedded
          batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
          A `list` of `dict`'s containing the original question ('orig_q'),
          the most similar question in the context ('best_q') and the
          associated answer ('best_a').

        Raises:
          RuntimeError: if no context has been set yet.
        """
        similarity = self.cosine_similarity(questions, batch=batch)

        response = []
        for i in range(similarity.shape[0]):
            # Convert to a plain int before indexing the Python lists.
            best_ix = int(similarity[i].argmax())
            response.append(
                {
                    'orig_q': questions[i],
                    'best_q': self.questions[best_ix],
                    'best_a': self.answers[best_ix],
                }
            )

        return response

In [17]:
# %pip (instead of a "!" shell escape) installs into the running kernel's environment.
%pip install uvicorn
%pip install fastapi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable


In [18]:
import uvicorn
from fastapi import FastAPI, Request

# Module-level singletons: the searcher whose context the endpoints below set
# and query, and the FastAPI application they are registered on.
qa_search = QASearcher()
app = FastAPI()

@app.post("/set_context")
async def set_context(data:Request):
    """
    FastAPI POST endpoint that stores the QA context used for search.

    Args:
      data(`Request`): Request whose JSON body carries two fields,
        'questions' (`list` of `str`) and 'answers' (`list` of `str`).

    Returns:
      A `dict` with a confirmation 'message'.
    """
    payload = await data.json()
    qa_search.set_context_qa(payload['questions'], payload['answers'])
    return {"message": "Search context set"}


@app.post("/get_answer")
async def get_answer(data:Request):
    """
    FastAPI POST endpoint that looks up, for every submitted question, the
    best matching question and answer in the stored context.

    Args:
      data(`Request`): Request whose JSON body carries one field,
        'questions' (`list` of `str`).

    Returns:
      The result of `qa_search.get_answers`: a `list` of `dict`'s holding the
      original question ('orig_q'), the most similar context question
      ('best_q') and the associated answer ('best_a').
    """
    payload = await data.json()
    # Questions are embedded one at a time (batch=1).
    return qa_search.get_answers(payload['questions'], batch=1)

In [24]:
import requests

# Context payload: the questions with their answers, in matching order.
json_data = {
  'questions':questions,
  'answers':answers,
}

# Connect to the loopback address: 0.0.0.0 is a bind address and is not a
# portable destination for a client connection.
response = requests.post(
  'http://127.0.0.1:8000/set_context',
  json=json_data
)
# Stop here on an HTTP error instead of displaying an error payload.
response.raise_for_status()

response.json()

{'message': 'Search context set'}

In [25]:
new_questions = [
    'How many teams compete in the Premier League ?',
    'When does the Premier League starts and finishes ?',
    'Who has the highest number of goals in the Premier League ?',
]

json_data = {
  'questions':new_questions,
}

# Connect to the loopback address: 0.0.0.0 is a bind address and is not a
# portable destination for a client connection.
response = requests.post(
  'http://127.0.0.1:8000/get_answer',
  json=json_data
)
# An error reply is a dict, not the expected list of results; fail early
# rather than iterating over it below.
response.raise_for_status()

# One block per question: original question, best match and its answer.
for d in response.json():
  print('\n'.join(["{} : {}".format(k, v) for k,v in d.items()])+'\n')

orig_q : How many teams compete in the Premier League ?
best_q : How many clubs are currently in the Premier League?
best_a : 20

orig_q : When does the Premier League starts and finishes ?
best_q : When does the Premier League have its playing season?
best_a : During the course of a season (from August to May)

orig_q : Who has the highest number of goals in the Premier League ?
best_q : Who has the record for most goals in the Premier League?
best_a : Newcastle United striker Alan Shearer holds the record for most Premier League goals with 260

