# Retrieval using first 3 queries from sampled_queries.tsv

In [10]:
import pandas as pd
sampled_queries = pd.read_csv('sampled_queries_1k.tsv', sep='\t', header=0, names=['qid', 'query'])
test_queries = sampled_queries.head(1)
test_queries

Unnamed: 0,qid,query
0,507646,symptoms of flu a & b in children


In [11]:
collection_df = pd.read_csv("common_dataset_80k.tsv", sep="\t", header=None, names=["pid", "text"], dtype={"pid": str, "text": str})
collection_df.head()

Unnamed: 0,pid,text
0,448,A postal code (also known locally in various E...
1,466,"Therefore, all pathologists must have complete..."
2,646,Obesity is a complex disorder involving an exc...
3,1212,Which president appointed FBI Director James C...
4,1213,"Comey was confirmed by the Senate on July 29, ..."


In [12]:
def format_topN_results(topN_results, test_queries, collection_df):
    rows = []
    for qid, results in topN_results.items():
        query = test_queries.loc[test_queries['qid'] == int(qid), 'query'].values[0]
        for pid, score in results:
            passage = collection_df.loc[collection_df['pid'] == pid, 'text'].values[0]
            rows.append({'query': query, 'pid': pid, 'passage': passage, 'score': score})
    return pd.DataFrame(rows)

### Test retrieval with TF-IDF

In [13]:
import joblib
from tf_idf_utils import retrieve_topN_for_queries

vectorizer = joblib.load('tfidf_vectorizer.joblib')
doc_matrix = joblib.load('tfidf_doc_matrix.joblib')

topN_results = retrieve_topN_for_queries(vectorizer, doc_matrix, collection_df['pid'], test_queries, topN=10)

retrieval_df = format_topN_results(topN_results, test_queries, collection_df)
retrieval_df.head()


Retrieving: 100%|██████████| 1/1 [00:00<00:00, 63.03it/s]


Unnamed: 0,query,pid,passage,score
0,symptoms of flu a & b in children,822848,Symptoms of flu. The symptoms of flu usually d...,0.394323
1,symptoms of flu a & b in children,7071178,What are the common flu symptoms? A: Common sy...,0.345496
2,symptoms of flu a & b in children,7201019,The main difference between cold and flu is th...,0.34074
3,symptoms of flu a & b in children,1580421,Detecting early symptoms of the flu can preven...,0.313504
4,symptoms of flu a & b in children,196474,All symptoms of the flu are usually gone in 7 ...,0.28917


## Test retrieval with BM25

In [14]:
from whoosh import index
from whoosh.qparser import QueryParser, OrGroup
from whoosh.scoring import BM25F
from tqdm.auto import tqdm
IDX_DIR = "indexes/whoosh"
K1, B = 1.2, 0.75

topN_results = {}

ix = index.open_dir(IDX_DIR)
with ix.searcher(weighting=BM25F(k1=K1, b=B)) as searcher:
    qp = QueryParser("text", schema=ix.schema, group=OrGroup)
    it = test_queries[["qid","query"]].itertuples(index=False, name=None)

    for qid, query in it:
        q = qp.parse(query)
        results = searcher.search(q, limit=10)
        rows = [(r['pid'], r.score) for r in results]
        topN_results[str(qid)] = rows

retrieval_df = format_topN_results(topN_results, test_queries, collection_df)
retrieval_df.head()

Unnamed: 0,query,pid,passage,score
0,symptoms of flu a & b in children,1580421,Detecting early symptoms of the flu can preven...,22.446466
1,symptoms of flu a & b in children,7087923,The flu is caused by a virus. Common symptoms ...,21.515997
2,symptoms of flu a & b in children,7988185,Yes | No Thank you! Flu shots are not made for...,20.992554
3,symptoms of flu a & b in children,7492976,1 A specific syrup containing elderberry juice...,20.895779
4,symptoms of flu a & b in children,35887,The Flu Is Contagious Most healthy adults may ...,20.811622


## Test retrieval with DPR

In [15]:
import numpy as np
embedding_filename = f"passage_embeddings_80k.npy"

# Load later
passage_embeddings = np.load(embedding_filename)
print(passage_embeddings.shape)

(80000, 768)


In [16]:
import faiss
dim = passage_embeddings.shape[1]  # typically 768 for DPR
faiss.normalize_L2(passage_embeddings)  # normalize for cosine similarity
index = faiss.IndexFlatIP(dim)
index.add(passage_embeddings)

print("Number of vectors in FAISS:", index.ntotal)

Number of vectors in FAISS: 80000


In [17]:
import numpy as np
import torch
import faiss
from tqdm import tqdm
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

question_encoder = DPRQuestionEncoder.from_pretrained("./dpr_question_encoder").to(DEVICE)
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("./dpr_question_encoder")

qids = test_queries['qid'].tolist()
queries_list = test_queries['query'].tolist()

for qid, q in tqdm(zip(qids, queries_list), total=len(queries_list), desc="Retrieving"):
    inputs = question_tokenizer(q, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
    with torch.no_grad():
        q_emb = question_encoder(**inputs).pooler_output
    D, I = index.search(q_emb.detach().cpu().numpy(), 10)

    rows = [(collection_df.iloc[i]['pid'], float(d)) for i, d in zip(I[0], D[0])]
    topN_results[str(qid)] = rows

retrieval_df = format_topN_results(topN_results, test_queries, collection_df)
retrieval_df.head()

Retrieving:   0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Retrieving: 100%|██████████| 1/1 [00:00<00:00, 32.50it/s]


Unnamed: 0,query,pid,passage,score
0,symptoms of flu a & b in children,7548862,A: Symptoms of influenza in children include a...,11.732754
1,symptoms of flu a & b in children,7619619,Below are the symptoms that some individuals m...,10.516277
2,symptoms of flu a & b in children,7828612,Flu Symptoms. The most common symptoms of the ...,9.839459
3,symptoms of flu a & b in children,7480406,Symptoms of TEF in adult patients may include:...,9.776333
4,symptoms of flu a & b in children,109900,"Influenza, commonly known as the flu, is an in...",9.57412


## Test re-ranking with Cross Encoder

In [18]:
from sentence_transformers import CrossEncoder

cross_model = CrossEncoder("./cross-encoder-model")

pairs = list(zip(retrieval_df["query"], retrieval_df["passage"]))

cross_scores = cross_model.predict(pairs, show_progress_bar=True)
retrieval_df["cross_score"] = cross_scores
retrieval_df = retrieval_df.sort_values(by=["cross_score"], ascending=[False])
retrieval_df.head(10)

Batches: 100%|██████████| 1/1 [00:00<00:00, 119.19it/s]


Unnamed: 0,query,pid,passage,score,cross_score
0,symptoms of flu a & b in children,7548862,A: Symptoms of influenza in children include a...,11.732754,6.749187
5,symptoms of flu a & b in children,7590755,The list of signs and symptoms mentioned in va...,9.549688,5.264628
4,symptoms of flu a & b in children,109900,"Influenza, commonly known as the flu, is an in...",9.57412,5.243701
2,symptoms of flu a & b in children,7828612,Flu Symptoms. The most common symptoms of the ...,9.839459,3.305002
7,symptoms of flu a & b in children,7480405,Symptoms of TEF in infants are generally worse...,9.446805,-2.090311
6,symptoms of flu a & b in children,7649123,Other mild childhood illnesses: EBV infection ...,9.508144,-8.698321
3,symptoms of flu a & b in children,7480406,Symptoms of TEF in adult patients may include:...,9.776333,-9.161399
9,symptoms of flu a & b in children,7661062,About 1 out of 4 people with poliovirus infect...,9.42206,-10.012847
8,symptoms of flu a & b in children,964216,Signs and symptoms of depression in teens. 1 ...,9.438099,-10.15023
1,symptoms of flu a & b in children,7619619,Below are the symptoms that some individuals m...,10.516277,-10.312838
