# **Import Liraries**

In [1]:
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m920.6 kB/s[0m eta [36m0:00:00[0m:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
!sudo apt-get install libomp-dev -y
!pip install -qq faiss-gpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libllvm14 libomp-14-dev libomp5-14
Suggested packages:
  libomp-14-doc
The following NEW packages will be installed:
  libllvm14 libomp-14-dev libomp-dev libomp5-14
0 upgraded, 4 newly installed, 0 to remove and 39 not upgraded.
Need to get 24.7 MB of archives.
After this operation, 118 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libllvm14 amd64 1:14.0.0-1ubuntu1.1 [24.0 MB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp5-14 amd64 1:14.0.0-1ubuntu1.1 [389 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp-14-dev amd64 1:14.0.0-1ubuntu1.1 [347 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libomp-dev amd64 1:14.0-55~exp2 [3074 B]
Fetched 24.7 MB in 2s (11.5 MB/s)    
Selecting previously unselected packag

In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# **Download Dataset**

In [4]:
DATASET = 'rajpurkar/squad_v2'
raw_datasets = load_dataset(DATASET, split='train+validation')

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [5]:
raw_datasets

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})

In [6]:
# Filter out non-answerable samples
raw_datasets = raw_datasets.filter(
    lambda x: len(x['answers']['text']) > 0
)
len(raw_datasets)

Filter:   0%|          | 0/142192 [00:00<?, ? examples/s]

92749

In [7]:
columns = raw_datasets.column_names
columns_to_keep = ['id', 'context', 'question', 'answers']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
raw_datasets = raw_datasets.remove_columns(columns_to_remove)
raw_datasets

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 92749
})

# **Initialize pre-trained model**

In [8]:
MODEL_NAME = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# **Create get vector embedding functions**

In [9]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [10]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [11]:
encoded_input = get_embeddings(raw_datasets['question'][0])
encoded_input.shape

torch.Size([1, 768])

In [12]:
EMBEDDING_COLUMN = 'question_embedding'
embedding_dataset = raw_datasets.map(
    lambda x: {EMBEDDING_COLUMN: get_embeddings(x['question']).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/92749 [00:00<?, ? examples/s]

In [13]:
embedding_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

  0%|          | 0/93 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'context', 'question', 'answers', 'question_embedding'],
    num_rows: 92749
})

# **Search similar samples with a question**

In [14]:
input_question = 'When did Beyonce start becoming popular?'

input_question_embedding = get_embeddings([input_question]).cpu().detach().numpy()
input_question_embedding.shape

(1, 768)

In [16]:
TOP_K = 5
scores, samples = embedding_dataset.get_nearest_examples(
    EMBEDDING_COLUMN, 
    input_question_embedding,
    k=TOP_K
)

In [21]:
for idx, score in enumerate(scores):
    print(f'Top {idx+1}\tScore: {score}')
    print(f'Question: {samples["question"][idx]}')
    print(f'Context: {samples["context"][idx]}')
    print()

Top 1	Score: 0.0
Question: When did Beyonce start becoming popular?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

Top 2	Score: 2.6135313510894775
Question: When did Beyoncé rise to fame?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress

# **QA**

In [22]:
from transformers import pipeline

PIPELINE_NAME = 'question-answering'
MODEL_NAME = 'tuong2402/distilbert-finetuned-squadv2'
pipe = pipeline(PIPELINE_NAME, MODEL_NAME)

config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# **Predict**

In [51]:
def predict(input_question):
    input_question_embedding = get_embeddings([input_question]).detach().cpu().numpy()
    
    scores, examples = embedding_dataset.get_nearest_examples(
        EMBEDDING_COLUMN,
        input_question_embedding,
        k=1
    )
    
    answer = pipe(
        question=examples['question'][0],
        context=examples['context'][0]
    )
    
    return scores[0], examples['question'][0], examples['context'][0], answer['answer']

In [59]:
input_question = 'Who discovered tyrothricin?'

score, question, context, answer = predict(input_question)
print(f'Score: {score}')
print(f'Question: {question}')
print(f'Context: {context}')
print(f'Answer: {answer}')

Score: 0.0
Question: Who discovered tyrothricin?
Context: In 1939, coinciding with the start of World War II, Rene Dubos reported the discovery of the first naturally derived antibiotic, tyrothricin, a compound of 20% gramicidin and 80% tyrocidine, from B. brevis. It was one of the first commercially manufactured antibiotics universally and was very effective in treating wounds and ulcers during World War II. Gramicidin, however, could not be used systemically because of toxicity. Tyrocidine also proved too toxic for systemic usage. Research results obtained during that period were not shared between the Axis and the Allied powers during the war.
Answer: Rene Dubos
