# Import Libraries

In [None]:
!pip install-qq transformers[sentencepiece]==4.35.2 datasets==2.16.1 evaluate==0.4.1
!sudo apt-get install libomp-dev
!pip install-qq faiss-gpu

In [None]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from tqdm.auto import tqdm

# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset: SQuAD2.0

In [None]:
# Identifier of the dataset passed to `load_dataset`.
DATASET_NAME = 'squad_v2'

# Request the train and validation splits together as a single dataset.
raw_datasets = load_dataset(path=DATASET_NAME, split='train+validation')

# Last expression of the cell: shows the dataset summary in the notebook.
raw_datasets

# Remove unanswerable questions (samples with no answer text)

In [None]:
def _has_answer(example):
    """Return True when the example has at least one answer text."""
    return len(example['answers']['text']) > 0


# Keep only the examples whose 'answers' field contains at least one text.
raw_datasets = raw_datasets.filter(_has_answer)

# Model

In [None]:
# Checkpoint used to embed questions.
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer and bare encoder are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the encoder's weights onto the selected device.
model = model.to(device)

# Vector embedding

In [None]:
def cls_pooling(model_output):
    """Pool a model output by taking index 0 along the second axis.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing (presumably laid out as
            (batch, sequence, hidden) -- confirm against the model used).

    Returns:
        ``last_hidden_state[:, 0]``, i.e. the first position of every row.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

In [None]:
def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: Text to embed; passed unchanged to the tokenizer, so it may
            be a single string or a list of strings.

    Returns:
        The tensor produced by ``cls_pooling`` on the model output. It is
        computed without gradient tracking, so it carries no autograd graph.
    """
    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    # The model lives on `device`, so every input tensor must be moved there.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    # Inference only: disabling autograd avoids building (and holding on to)
    # a computation graph for every call. Callers that still call `.detach()`
    # on the result keep working.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

# Vector database

In [None]:
# Name of the column that stores each question's embedding vector.
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question(example):
    """Return the new embedding column for a single dataset example."""
    pooled = get_embeddings(example['question'])
    # One question goes in, so row 0 of the output is its embedding.
    vector = pooled.detach().cpu().numpy()[0]
    return {EMBEDDING_COLUMN: vector}


# Compute an embedding for every example, one example per call.
embeddings_dataset = raw_datasets.map(_embed_question)

In [None]:
# Build a FAISS index over the embedding column so that nearest-neighbour
# queries can be run against `embeddings_dataset` by column name.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

In [None]:
input_question = 'When did Beyonce start becoming popular?'

# Embed the query as a one-element batch and hand it to the index as NumPy.
input_quest_embedding = get_embeddings([input_question]).cpu().detach().numpy()

TOP_K = 5

# Retrieve the TOP_K stored examples closest to the query embedding.
scores, samples = embeddings_dataset.get_nearest_examples(
    EMBEDDING_COLUMN,
    input_quest_embedding,
    k=TOP_K,
)

# `samples` is column-oriented: samples[column][idx] lines up with scores[idx].
for idx, score in enumerate(scores):
    print(f'Top {idx + 1}\tScore: {score}')
    print(f'Question: {samples["question"][idx]}')
    print(f'Context: {samples["context"][idx]}')
    print()

# Apply model

In [None]:
from transformers import pipeline

# Task identifier and checkpoint for the extractive question-answering stage.
PIPELINE_NAME = 'question-answering'
# NOTE: this rebinds MODEL_NAME, which earlier named the embedding checkpoint.
MODEL_NAME = 'thangduong0509/distilbert-finetuned-squadv2'

pipe = pipeline(task=PIPELINE_NAME, model=MODEL_NAME)

In [None]:
print(f'Input question: {input_question}')

# Run the QA pipeline once per retrieved example and report the result.
# NOTE(review): the pipeline is given the *retrieved* question, not
# `input_question` -- confirm this is intended.
for idx, score in enumerate(scores):
    question = samples["question"][idx]
    context = samples["context"][idx]

    answer = pipe(question=question, context=context)

    print(f'Top {idx + 1}\tScore: {score}')
    print(f'Context: {context}')
    print(f'Answer: {answer}')
    print()