In [1]:
import json
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
)

In [2]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

with open(TEST_FILE, "r") as f:
    data = json.load(f)

data = data["questions"][:5]  # for the test purpose, take one instance
print(data[0].keys())

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])


In [3]:
RETRIEVE_TOP_K = 200
RERANK_BM25 = 50
RERANK_CROSS = 25
RERANK_BI = 10
ABSTRACTS_FOR_ANSWER = 3

In [4]:
N = 0
questions = [
    Question(
        id=q["id"],
        type=q["type"],
        body=q["body"],
    )
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

for question in tqdm(questions):
    N += 1
    retrieved = retrieve_abstracts.transform([question])
    # retrieved = pubmed_retrieve(question, RETRIEVE_TOP_K)
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25)
    retrieved["cos_sim"] = rerank_crossencoder(question, retrieved)
    reranked = retrieved.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)
    retrieved["cos_sim"] = rerank_biencoder(question, retrieved)
    reranked = retrieved.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

100%|██████████| 5/5 [00:51<00:00, 10.27s/it]


In [5]:
x = 0
questions[x].type, questions[x].body, questions[x].id

('summary',
 'Is Hirschsprung disease a mendelian or a multifactorial disorder?',
 '55031181e9bde69634000014')

In [6]:
reranked.head(15)

Unnamed: 0,docno,score,rank,title,text,url,bm25,cos_sim
65,38157760,0,66,Expression and functional characterization of ...,Receptor activator of nuclear factor Kappa-B L...,https://pubmed.ncbi.nlm.nih.gov/38157760/,2.090213,0.426827
127,37832903,0,128,Cooperation between T and B cells reinforce th...,Immune cells educated by the primary breast tu...,https://pubmed.ncbi.nlm.nih.gov/37832903/,7.139541,0.417535
122,37893470,0,123,The RANK-RANKL-OPG System: A Multifaceted Regu...,The RANK-RANKL-OPG system is a complex signali...,https://pubmed.ncbi.nlm.nih.gov/37893470/,2.21014,0.404899
1,38519718,0,2,Oestradiol and osteoclast differentiation: Eff...,Oestrogen deficiency increases bone resorption...,https://pubmed.ncbi.nlm.nih.gov/38519718/,0.0,0.403198
145,37750211,0,146,Rubiadin-1-methyl ether inhibits BECN1 transcr...,As an active substance isolated from the root of,https://pubmed.ncbi.nlm.nih.gov/37750211/,0.0,0.399161
97,38008422,0,98,GP2-expressing cells: a new guardian with dive...,"GP (glycoprotein)-2, originally identified as ...",https://pubmed.ncbi.nlm.nih.gov/38008422/,2.356248,0.380436
187,37463401,0,188,Inhibitory Effects of Wheat Sprouts Extract on...,The maintenance of bone is dependent on both o...,https://pubmed.ncbi.nlm.nih.gov/37463401/,1.72941,0.370273
16,38413562,0,17,The neutrophil-osteogenic cell axis promotes b...,The immune-stromal cell interactions play a ke...,https://pubmed.ncbi.nlm.nih.gov/38413562/,2.168425,0.364504
17,38402724,0,18,The stiffness and collagen control differentia...,Osteoclasts are hematopoietic cells attached t...,https://pubmed.ncbi.nlm.nih.gov/38402724/,1.814585,0.3621
135,37796390,0,136,The Differential Effect of Metformin on Osteoc...,"Metformin is an anti-glycemic agent, which is ...",https://pubmed.ncbi.nlm.nih.gov/37796390/,1.736201,0.347454


In [7]:
print(reranked.title.tolist()[0])
print(reranked.text.tolist()[0])
print(reranked.snippets_blablador_gpt.tolist()[0])
print(reranked.snippets_blablador_mistral.tolist()[0])

Expression and functional characterization of bovine receptor activator of NF-κB ligand (RANKL).
Receptor activator of nuclear factor Kappa-B Ligand (RANKL) is a member of the tumor necrosis factor ligand (TNF) family involved in immune responses and immunomodulation. Expressed in various cells types around the body, RANKL plays a crucial role in bone remodeling and development of the thymus, lymph nodes and mammary glands. Research in other species demonstrates that RANKL is required for the development of microfold cells (M cells) in the gut, however limited information specific to cattle is available. Cloning and expression of bovine RANKL (BoRANKL) was carried out and bioactivity of the protein was demonstrated in the induction of osteoclast differentiation from both bovine and ovine bone marrow cells. The effects of BoRANKL on particle uptake in bovine enteroids was also assessed. The production of cross-reactive bovine RANKL protein will enable further investigations into cell di

AttributeError: 'DataFrame' object has no attribute 'snippets_blablador_gpt'

In [None]:
reranked.abstract_snippets.tolist()[4]

['The differentially expressed genes were mainly associated with catalytic activity, oxidoreductase activity and transmembrane transporter activity, which significantly contributed to extracellular matrix-receptor interaction, fatty acids biosynthesis as well as glycine, serine, and threonine metabolism.']

In [None]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )

# reranked["snippets"] = reranked.apply(
#     lambda x: get_snippets(question.body, x["title"], x["text"]), axis=1
# )
# reranked["title_snippets"] = [x[0] for x in reranked["snippets"]]
# reranked["abstract_snippets"] = [x[1] for x in reranked["snippets"]]
# reranked["offset_title"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets"], x["text"]), axis=1
# )
# reranked["offset_abstract"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets"], x["text"]), axis=1
# )

# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)