In [1]:
import json

from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
)

In [2]:
# Location of the BioASQ training file, relative to this notebook's directory.
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

# JSON is expected to be UTF-8; pass the encoding explicitly so decoding does
# not depend on the platform's locale default. Slicing straight from the parsed
# document means `data` is only ever bound to the list of question records.
with open(TEST_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)["questions"][2:3]  # for the test purpose, take one instance

# The slice silently yields an empty list when the file has fewer than three
# questions; fail here with a clear message instead of an IndexError below.
assert len(data) == 1, f"expected exactly one question from {TEST_FILE}, got {len(data)}"
print(data[0].keys())

dict_keys(['body', 'documents', 'ideal_answer', 'exact_answer', 'type', 'id', 'snippets'])


In [3]:
# Number of results requested from the retriever (passed as `num_results`).
RETRIEVE_TOP_K = 5
# Number of best-scoring rows kept after sorting the retrieved frame by `cos_sim`.
RERANK_TOP_K = 5
# Number of top-ranked abstracts joined into the context for the abstract-based answers.
ABSTRACTS_FOR_ANSWER = 3

In [7]:
# Wrap every raw record in a `Question`, forwarding only its id, type and body.
questions = [
    Question(id=entry["id"], type=entry["type"], body=entry["body"])
    for entry in data
]
# Per-question pipeline: retrieve -> rerank -> extract snippets -> locate
# snippet offsets -> generate exact/ideal answers from abstracts and snippets.

# The retriever's configuration does not depend on the question, so build it
# once instead of on every iteration.
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

# One result frame per question, in the same order as `questions`. After the
# loop, `reranked` still refers to the frame of the last question processed.
reranked_per_question = []

for question in questions:
    # Retrieve for the current question only; passing the whole `questions`
    # list here would mix other questions' results into this question's frame.
    retrieved = retrieve_abstracts.transform([question])

    # Score each retrieved row against the question
    # (presumably a cosine similarity, going by the column name - confirm).
    retrieved["cos_sim"] = rerank_biencoder(question, retrieved)

    # Keep the best-scoring rows. `.copy()` makes the result an independent
    # frame so the column assignments below never write into a slice of
    # `retrieved` (avoids pandas' SettingWithCopyWarning).
    reranked = (
        retrieved.sort_values("cos_sim", ascending=False).head(RERANK_TOP_K).copy()
    )

    # `get_snippets` yields a pair per row: (title snippets, abstract snippets).
    reranked["snippets"] = reranked.apply(
        lambda row: get_snippets(question.body, row["title"], row["text"]), axis=1
    )
    reranked["title_snippets"] = [pair[0] for pair in reranked["snippets"]]
    reranked["abstract_snippets"] = [pair[1] for pair in reranked["snippets"]]

    # Locate every snippet in the section it was extracted from: title snippets
    # in the title, abstract snippets in the abstract text. (Title snippets were
    # previously looked up in the abstract text.)
    reranked["offset_title"] = reranked.apply(
        lambda row: get_offset(row["title_snippets"], row["title"]), axis=1
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda row: get_offset(row["abstract_snippets"], row["text"]), axis=1
    )

    # Each answer is generated once per question and repeated on every row.
    row_count = len(reranked)

    # Context 1: the top abstracts, joined into a single string.
    abstracts_context = " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER])
    reranked["answer_abstracts_exact"] = [
        response_exact_answer(question.body, question.type, abstracts_context)
    ] * row_count
    reranked["answer_abstracts_ideal"] = [
        response_ideal_answer(question.body, question.type, abstracts_context)
    ] * row_count

    # Context 2: all extracted snippets (abstract snippets first, then title
    # snippets), built once and shared by both snippet-based answers.
    snippets_context = " ".join(
        flat_list(reranked["abstract_snippets"].tolist())
        + flat_list(reranked["title_snippets"].tolist())
    )
    reranked["answer_snippets_exact"] = [
        response_exact_answer(question.body, question.type, snippets_context)
    ] * row_count
    reranked["answer_snippets_ideal"] = [
        response_ideal_answer(question.body, question.type, snippets_context)
    ] * row_count

    reranked_per_question.append(reranked)

['We also observed the accumulation of many structural and extracellular proteins such as papilin-like protein, which glycosylation increase its stability-based molecular modeling.', 'The differentially expressed genes were mainly associated with catalytic activity, oxidoreductase activity and transmembrane transporter activity, which significantly contributed to extracellular matrix-receptor interaction, fatty acids biosynthesis as well as glycine, serine, and threonine metabolism.']
[]
['We also observed the accumulation of many structural and extracellular proteins such as papilin-like protein, which glycosylation increase its stability-based molecular modeling.', 'The differentially expressed genes were mainly associated with catalytic activity, oxidoreductase activity and transmembrane transporter activity, which significantly contributed to extracellular matrix-receptor interaction, fatty acids biosynthesis as well as glycine, serine, and threonine metabolism.']
[]


In [9]:
# Display the type and body of the first question.
questions[0].type, questions[0].body

('yesno', 'Is the protein Papilin secreted?')

In [8]:
# Preview the reranked frame; `reranked` holds the results of the last question
# processed by the pipeline loop.
reranked.head()

Unnamed: 0,docno,score,rank,title,text,url,cos_sim,snippets,title_snippets,abstract_snippets,offset_title,offset_abstract,answer_abstracts_exact,answer_abstracts_ideal,answer_snippets_exact,answer_snippets_ideal
0,38284126,0,1,Secreted ADAMTS-like proteins as regulators of...,The extracellular matrix (ECM) determines func...,https://pubmed.ncbi.nlm.nih.gov/38284126/,0.362601,"([], [])",[],[],[],[],yes,"[Yes, the protein Papilin is secreted.]",yes,"[Based on the information provided, it is not ..."
2,34240827,0,3,Primary angle closure glaucoma is characterize...,To characterize the proteome of the iris in pr...,https://pubmed.ncbi.nlm.nih.gov/34240827/,0.302506,"([], [])",[],[],[],[],yes,"[Yes, the protein Papilin is secreted.]",yes,"[Based on the information provided, it is not ..."
1,36997062,0,2,The ovaries of ivermectin-resistant Rhipicepha...,Controlling Rhipicephalus microplus is among t...,https://pubmed.ncbi.nlm.nih.gov/36997062/,0.297135,"([], [We also observed the accumulation of man...",[],[We also observed the accumulation of many str...,[],"[(868, 1046)]",yes,"[Yes, the protein Papilin is secreted.]",yes,"[Based on the information provided, it is not ..."
4,32585132,0,5,Comprehensive Endogenous Tagging of Basement M...,Basement membranes (BMs) are supramolecular ma...,https://pubmed.ncbi.nlm.nih.gov/32585132/,0.279801,"([], [])",[],[],[],[],yes,"[Yes, the protein Papilin is secreted.]",yes,"[Based on the information provided, it is not ..."
3,33590535,0,4,Transcriptional changes revealed genes and pat...,Sphingolipids are ubiquitous structural compon...,https://pubmed.ncbi.nlm.nih.gov/33590535/,0.208988,"([], [The differentially expressed genes were ...",[],[The differentially expressed genes were mainl...,[],"[(1058, 1361)]",yes,"[Yes, the protein Papilin is secreted.]",yes,"[Based on the information provided, it is not ..."


In [10]:
# Snippet-based ideal answer; the pipeline stores the same value on every row,
# so reading the first row is enough.
reranked["answer_snippets_ideal"].iloc[0]

['Based on the information provided, it is not clear whether the protein Papilin is secreted or not. Further studies are needed to determine its cellular localization and secretion mechanism.']

In [31]:
# Abstract snippets extracted for the row at position 4 of the reranked frame.
reranked["abstract_snippets"].iloc[4]

['The differentially expressed genes were mainly associated with catalytic activity, oxidoreductase activity and transmembrane transporter activity, which significantly contributed to extracellular matrix-receptor interaction, fatty acids biosynthesis as well as glycine, serine, and threonine metabolism.']

In [2]:
# Hand-written sample inputs: a question plus the title and abstract of one article.
# NOTE(review): `question` is a plain str here and rebinds the name the pipeline
# loop uses for `Question` objects - presumably meant for trying the snippet
# helpers in isolation; confirm before running cells out of order.
question = "Please list 2 human diseases caused by a coronavirus."
title = "Host Factors in Coronavirus Replication"
abstract = "Coronaviruses are pathogens with a serious impact on human and animal health. They mostly cause enteric or respiratory disease, which can be severe and life threatening, e.g., in the case of the zoonotic coronaviruses causing severe acute respiratory syndrome (SARS) and Middle East Respiratory Syndrome (MERS) in humans. Despite the economic and societal impact of such coronavirus infections, and the likelihood of future outbreaks of additional pathogenic coronaviruses, our options to prevent or treat coronavirus infections remain very limited. This highlights the importance of advancing our knowledge on the replication of these viruses and their interactions with the host. Compared to other +RNA viruses, coronaviruses have an exceptionally large genome and employ a complex genome expression strategy. Next to a role in basic virus replication or virus assembly, many of the coronavirus proteins expressed in the infected cell contribute to the coronavirus-host interplay. For example, by interacting with the host cell to create an optimal environment for coronavirus replication, by altering host gene expression or by counteracting the host's antiviral defenses. These coronavirus-host interactions are key to viral pathogenesis and will ultimately determine the outcome of infection. Due to the complexity of the coronavirus proteome and replication cycle, our knowledge of host factors involved in coronavirus replication is still in an early stage compared to what is known for some other +RNA viruses. This review summarizes our current understanding of coronavirus-host interactions at the level of the infected cell, with special attention for the assembly and function of the viral RNA-synthesising machinery and the evasion of cellular innate immune responses."

In [4]:
import re

# Minimal demonstration of locating a substring with `re.search`: `.span()`
# gives the (start, end) character offsets of the first match.
a = "Lorem ipsum dolor sit amet"
ipsum_match = re.search("ipsum", a)

# `re.search` returns None when nothing matches; fail with a clear message
# rather than an AttributeError on `.span()`.
assert ipsum_match is not None, "pattern 'ipsum' not found in the sample text"

# The cell's displayed value is the computed span (the hard-coded `(6, 11)`
# literal that used to follow this line masked the real result).
ipsum_match.span()

(6, 11)