In [1]:
import logging as logger

from dataclasses import dataclass, field
from typing import Optional, Dict, Any
from xml.etree.ElementTree import fromstring, Element

from pandas import DataFrame
from pandas.core.groupby import DataFrameGroupBy
from pyterrier.transformer import Transformer
from requests import get
from tqdm.auto import tqdm


@dataclass
class PubMedApiRetrieve:
    name = "PubMedApiRetrieve"

    eutils_api_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    esearch_api_url = f"{eutils_api_base_url}/esearch.fcgi"
    efetch_api_url = f"{eutils_api_base_url}/efetch.fcgi"

    query_field: str = "query"
    num_results: Optional[int] = 10
    verbose: bool = field(repr=False, default=False)

    def _transform_query(self, topic: DataFrame) -> DataFrame:
        if len(topic.index) != 1:
            raise RuntimeError("Can only transform one query at a time.")

        row: Dict[str, Any] = topic.to_dict(orient="records")[0]

        query: str = row[self.query_field]
        query = query.lower()
        query = query.replace(" ", "+")

        search_response = get(
            f"{self.esearch_api_url}?"
            f"db=pubmed&term={query}&retmax={self.num_results}"
        )
        search_root = fromstring(search_response.text)
        search_id_list = search_root.find("IdList")
        search_ids: list[str] = [
            element.text.strip() for element in search_id_list.findall("Id")
        ]
        logger.debug(f"Found {len(search_ids)} articles for '{query}'.")

        assert len(search_ids) <= self.num_results

        search_ids_string = ",".join(search_ids)
        fetch_response = get(
            f"{self.efetch_api_url}?" f"db=pubmed&id={search_ids_string}&retmode=xml"
        )
        fetch_root = fromstring(fetch_response.text)
        fetch_articles: list[Element]
        if fetch_root.tag == "PubmedArticleSet":
            fetch_articles = [
                (
                    element.find("BookDocument").find("Book")
                    if element.find("BookDocument") is not None
                    else element.find("MedlineCitation").find("Article")
                )
                for element in fetch_root
            ]
        elif fetch_root.tag == "eFetchResult":
            fetch_articles = []
        else:
            raise Exception(f"Unexpected root tag '{fetch_root.tag}'.")
        logger.debug(
            f"Found {len(fetch_articles)} article texts "
            f"for ids '{search_ids_string}'."
        )

        assert len(search_ids) == len(fetch_articles)

        results: list[dict[str, Any]] = []
        for i, (doc_id, article) in enumerate(zip(search_ids, fetch_articles)):
            title_text = (
                article.find("BookTitle").text
                if article.find("BookTitle") is not None
                else article.find("ArticleTitle").text
            )
            title = title_text.strip() if title_text is not None else ""
            abstract_element = article.find("Abstract")
            abstract_texts = (
                (text.text for text in abstract_element.findall("AbstractText"))
                if abstract_element is not None
                else []
            )
            abstract = " ".join(
                [text.strip() for text in abstract_texts if text is not None]
            )
            results.append(
                {
                    **row,
                    "docno": doc_id,
                    "score": len(results) - i,
                    "rank": i + 1,
                    "title": title,
                    "text": abstract,
                    "url": f"https://pubmed.ncbi.nlm.nih.gov/{doc_id}/",
                }
            )

        return DataFrame(results)

    def transform(self, topics: DataFrame) -> DataFrame:
        if not {"qid", "query"}.issubset(topics.columns):
            raise RuntimeError("Needs qid and query columns.")

        if len(topics) == 0:
            return self._transform_query(topics)

        topics_by_query: DataFrameGroupBy = topics.groupby(
            by=["qid"],
            as_index=False,
            sort=False,
        )
        if self.verbose:
            # Show progress during reranking queries.
            tqdm.pandas(
                desc="Searching with PubMed API",
                unit="query",
            )
            topics_by_query = topics_by_query.progress_apply(self._transform_query)
        else:
            topics_by_query = topics_by_query.apply(self._transform_query)

        retrieved: DataFrame = topics_by_query.reset_index(drop=True)
        return retrieved

In [2]:
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=10)

In [3]:
topics = DataFrame({"qid": [1], "query": ["Do enterococci cause pneumonia?"]})

retrieved = retrieve_abstracts.transform(topics)

Searching with PubMed API:   0%|          | 0/1 [00:00<?, ?query/s]

In [4]:
retrieved

Unnamed: 0,qid,query,docno,score,rank,title,text,url
0,1,Do enterococci cause pneumonia?,38488375,0,1,Spectrum and antibiotic resistance in communit...,It is important to note that the causative age...,https://pubmed.ncbi.nlm.nih.gov/38488375/
1,1,Do enterococci cause pneumonia?,38354989,0,2,"The antimicrobial property of JY-1, a complex ...","The substantial increase of infections, caused...",https://pubmed.ncbi.nlm.nih.gov/38354989/
2,1,Do enterococci cause pneumonia?,38188241,0,3,An insight into genes responsible for fosfomyc...,Asymptomatic bacteriuria (ASB) is a common fin...,https://pubmed.ncbi.nlm.nih.gov/38188241/
3,1,Do enterococci cause pneumonia?,38092626,0,4,Shift in risk factors for mortality by period ...,This study was designed to determine changes i...,https://pubmed.ncbi.nlm.nih.gov/38092626/
4,1,Do enterococci cause pneumonia?,37843115,0,5,Priorities and Progress in Gram-positive Bacte...,The Antibacterial Resistance Leadership Group ...,https://pubmed.ncbi.nlm.nih.gov/37843115/
5,1,Do enterococci cause pneumonia?,37700799,0,6,Patterns of Drug Resistance and Bacterial Path...,Urinary tract infections (UTIs) and the antibi...,https://pubmed.ncbi.nlm.nih.gov/37700799/
6,1,Do enterococci cause pneumonia?,37629701,0,7,Epidemiology and Antimicrobial Resistance Patt...,,https://pubmed.ncbi.nlm.nih.gov/37629701/
7,1,Do enterococci cause pneumonia?,37533663,0,8,Isolation and Detection of Drug-Resistant Bact...,"Surgical site infections (SSIs), especially wh...",https://pubmed.ncbi.nlm.nih.gov/37533663/
8,1,Do enterococci cause pneumonia?,30020605,0,9,StatPearls,,https://pubmed.ncbi.nlm.nih.gov/30020605/
9,1,Do enterococci cause pneumonia?,37451958,0,10,Impact of coronavirus disease 2019 (COVID-19) ...,Coronavirus disease 2019 (COVID-19) has caused...,https://pubmed.ncbi.nlm.nih.gov/37451958/


In [23]:
print(retrieved.columns)
print(retrieved.iloc[0].title)
print(retrieved.iloc[0].text)

Index(['qid', 'query', 'docno', 'score', 'rank', 'title', 'text', 'url'], dtype='object')
Spectrum and antibiotic resistance in community- and hospital-acquired urinary tract infections among adults: Experience from a large tertiary care center in a developing country.
It is important to note that the causative agents and patterns of antibiotic resistance vary between urinary tract infections (UTIs) acquired in the community and those acquired in a hospital setting. Therefore, the aim of this study was to compare the types of organisms and patterns of antibiotic resistance in adult patients with community-acquired urinary tract infections (CA-UTIs) and hospital-acquired urinary tract infections (HA-UTIs). Retrospectively, we collected urine samples from patients at An-Najah National University Hospital who experienced nonrecurring urinary tract infections (UTIs) between January 2019 and December 2020. The data were subsequently analyzed using IBM-SPSS A total of 798 nonrepetitive UTI p

In [5]:
import logging as logger

from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List
from xml.etree.ElementTree import fromstring, Element

from pandas import DataFrame
from pandas.core.groupby import DataFrameGroupBy
from pyterrier.transformer import Transformer
from requests import get
from tqdm.auto import tqdm

from mibi.modules import DocumentsModule, Question


@dataclass
class PubMedApiRetrieve(DocumentsModule):
    name = "PubMedApiRetrieve"

    eutils_api_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    esearch_api_url = f"{eutils_api_base_url}/esearch.fcgi"
    efetch_api_url = f"{eutils_api_base_url}/efetch.fcgi"

    # query_field: str = "query"
    num_results: Optional[int] = 10
    verbose: bool = field(repr=False, default=False)

    def _transform_query(self, question: Question) -> DataFrame:
        # if len(topic.index) != 1:
        #     raise RuntimeError("Can only transform one query at a time.")

        # row: Dict[str, Any] = topic.to_dict(orient="records")[0]

        query: str = question.body
        query = query.lower()
        query = query.replace(" ", "+")

        search_response = get(
            f"{self.esearch_api_url}?"
            f"db=pubmed&term={query}&retmax={self.num_results}"
        )
        search_root = fromstring(search_response.text)
        search_id_list = search_root.find("IdList")
        search_ids: list[str] = [
            element.text.strip() for element in search_id_list.findall("Id")
        ]
        logger.debug(f"Found {len(search_ids)} articles for '{query}'.")

        assert len(search_ids) <= self.num_results

        search_ids_string = ",".join(search_ids)
        fetch_response = get(
            f"{self.efetch_api_url}?" f"db=pubmed&id={search_ids_string}&retmode=xml"
        )
        fetch_root = fromstring(fetch_response.text)
        fetch_articles: list[Element]
        if fetch_root.tag == "PubmedArticleSet":
            fetch_articles = [
                (
                    element.find("BookDocument").find("Book")
                    if element.find("BookDocument") is not None
                    else element.find("MedlineCitation").find("Article")
                )
                for element in fetch_root
            ]
        elif fetch_root.tag == "eFetchResult":
            fetch_articles = []
        else:
            raise Exception(f"Unexpected root tag '{fetch_root.tag}'.")
        logger.debug(
            f"Found {len(fetch_articles)} article texts "
            f"for ids '{search_ids_string}'."
        )

        assert len(search_ids) == len(fetch_articles)

        results: list[dict[str, Any]] = []
        for i, (doc_id, article) in enumerate(zip(search_ids, fetch_articles)):
            title_text = (
                article.find("BookTitle").text
                if article.find("BookTitle") is not None
                else article.find("ArticleTitle").text
            )
            title = title_text.strip() if title_text is not None else ""
            abstract_element = article.find("Abstract")
            abstract_texts = (
                (text.text for text in abstract_element.findall("AbstractText"))
                if abstract_element is not None
                else []
            )
            abstract = " ".join(
                [text.strip() for text in abstract_texts if text is not None]
            )
            results.append(
                {
                    # **row,
                    "docno": doc_id,
                    "score": len(results) - i,
                    "rank": i + 1,
                    "title": title,
                    "text": abstract,
                    "url": f"https://pubmed.ncbi.nlm.nih.gov/{doc_id}/",
                }
            )

        return DataFrame(results)

    def transform(self, questions: List[Question]) -> DataFrame:
        # if not {"qid", "query"}.issubset(topics.columns):
        #     raise RuntimeError("Needs qid and query columns.")

        # if len(questions) == 0:
        #     return self._transform_query(topics)

        # topics_by_query: DataFrameGroupBy = topics.groupby(
        #     by=["qid"],
        #     as_index=False,
        #     sort=False,
        # )
        # if self.verbose:
        #     # Show progress during reranking queries.
        #     tqdm.pandas(
        #         desc="Searching with PubMed API",
        #         unit="query",
        #     )
        #     topics_by_query = topics_by_query.progress_apply(self._transform_query)
        # else:
        #     topics_by_query = topics_by_query.apply(self._transform_query)

        # retrieved: DataFrame = topics_by_query.reset_index(drop=True)
        for question in questions:
            retrieved = self._transform_query(question)
        return retrieved

In [8]:
questions = Question(
    id="55031181e9bde69634000014",
    type="summary",
    body="Is Hirschsprung disease a mendelian or a multifactorial disorder?",
)
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=10)
retrieved = retrieve_abstracts.transform([questions])

In [9]:
retrieved

Unnamed: 0,docno,score,rank,title,text,url
0,38473229,0,1,"Implications in Cancer of Nuclear Micro RNAs, ...",The eukaryotic genome is mainly transcribed in...,https://pubmed.ncbi.nlm.nih.gov/38473229/
1,38397019,0,2,Visfatin Affects the Transcriptome of Porcine ...,"Visfatin/NAMPT (VIS), the hormone exerting a p...",https://pubmed.ncbi.nlm.nih.gov/38397019/
2,38393342,0,3,ESRG regulates alternative splicing of TCF3 to...,Exploring the mechanism of self-renewal and pl...,https://pubmed.ncbi.nlm.nih.gov/38393342/
3,38383479,0,4,A tumor suppressor protein encoded by circKEAP...,Osteosarcoma (OS) is one of most commonly diag...,https://pubmed.ncbi.nlm.nih.gov/38383479/
4,38381315,0,5,Predicting circRNA-RBP Binding Sites Using a H...,Circular RNAs (circRNAs) are non-coding RNAs g...,https://pubmed.ncbi.nlm.nih.gov/38381315/
5,38376423,0,6,Statement of Retraction: Long non-coding RNA m...,,https://pubmed.ncbi.nlm.nih.gov/38376423/
6,38364571,0,7,Long noncoding RNA ABHD11-AS1 interacts with S...,Hexavalent chromium [Cr(VI)] is a common envir...,https://pubmed.ncbi.nlm.nih.gov/38364571/
7,38341138,0,8,LncRNA-mediated orchestrations of alternative ...,Alternative splicing (AS) is a fundamental pos...,https://pubmed.ncbi.nlm.nih.gov/38341138/
8,38309019,0,9,Long non-coding RNA (lncRNA) PVT1 in drug resi...,"According to estimates, cancer will be the lea...",https://pubmed.ncbi.nlm.nih.gov/38309019/
9,38294334,0,10,"Multiple strategies, including 6mA methylation...","Alternative splicing (AS), an important post-t...",https://pubmed.ncbi.nlm.nih.gov/38294334/
