In [10]:
import logging as logger

from dataclasses import dataclass, field
from typing import Optional, Dict, Any
from xml.etree.ElementTree import fromstring, Element

from pandas import DataFrame
from pandas.core.groupby import DataFrameGroupBy
from pyterrier.transformer import Transformer
from requests import get
from tqdm.auto import tqdm


@dataclass
class PubMedApiRetrieve:
    """Retrieve PubMed articles for queries via the NCBI E-utilities API.

    For every topic, the query is sent to ``esearch`` to obtain a ranked list
    of PubMed ids, and the matching records are then downloaded with
    ``efetch``. Each hit becomes one result row carrying the topic's columns
    plus ``docno``, ``score``, ``rank``, ``title``, ``text`` and ``url``.

    Attributes:
        query_field: Name of the topics column that holds the query text.
        num_results: Maximum number of hits per query; ``None`` sends no
            ``retmax`` parameter and leaves the limit to the API.
        verbose: Show a progress bar while queries are processed.
        timeout: Seconds to wait for each HTTP request; ``None`` waits
            indefinitely.
    """

    # NOTE: the attributes below are deliberately left un-annotated so that
    # ``@dataclass`` treats them as plain class attributes, not init fields.
    name = "PubMedApiRetrieve"

    eutils_api_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    # Strip the trailing slash before joining so the path has no "//".
    esearch_api_url = f"{eutils_api_base_url.rstrip('/')}/esearch.fcgi"
    efetch_api_url = f"{eutils_api_base_url.rstrip('/')}/efetch.fcgi"

    # Columns appended to the topic columns for every retrieved document.
    _result_columns = ("docno", "score", "rank", "title", "text", "url")

    query_field: str = "query"
    num_results: Optional[int] = 10
    verbose: bool = field(repr=False, default=False)
    timeout: Optional[float] = field(repr=False, default=30.0)

    @staticmethod
    def _element_text(element: Optional[Element]) -> str:
        """Return the full, stripped text of ``element`` ("" if it is None).

        ``Element.text`` only holds the text before the first child element,
        so inline markup (e.g. ``<i>``, ``<sup>``) would cut the text short;
        ``itertext()`` collects the text of the whole subtree instead.
        """
        if element is None:
            return ""
        return "".join(element.itertext()).strip()

    def _empty_results(self, topics: DataFrame) -> DataFrame:
        """Return a zero-row frame with the topic columns and result columns."""
        extra = [
            column for column in self._result_columns
            if column not in topics.columns
        ]
        return DataFrame(columns=[*topics.columns, *extra])

    def _search_ids(self, query: str) -> list:
        """Run ``esearch`` for ``query`` and return the ranked PubMed ids.

        Raises:
            RuntimeError: If the response contains no ``IdList`` element.
        """
        params: Dict[str, Any] = {"db": "pubmed", "term": query}
        if self.num_results is not None:
            params["retmax"] = self.num_results
        # Passing ``params`` lets requests URL-encode the query, so characters
        # such as "&" or "#" cannot corrupt the request.
        response = get(self.esearch_api_url, params=params, timeout=self.timeout)
        response.raise_for_status()

        id_list = fromstring(response.text).find("IdList")
        if id_list is None:
            raise RuntimeError(
                f"PubMed search response for '{query}' contains no IdList."
            )
        found = (self._element_text(item) for item in id_list.findall("Id"))
        ids = [doc_id for doc_id in found if doc_id]
        if self.num_results is not None:
            # Never hand back more hits than were asked for.
            ids = ids[: self.num_results]
        return ids

    def _fetch_articles(self, ids: list) -> list:
        """Run ``efetch`` for ``ids`` and return one article element per record.

        For book records the ``BookDocument/Book`` element is returned, for
        all other records the ``MedlineCitation/Article`` element (``None`` if
        a record has neither).

        Raises:
            RuntimeError: If the response has an unexpected root tag.
        """
        response = get(
            self.efetch_api_url,
            params={"db": "pubmed", "id": ",".join(ids), "retmode": "xml"},
            timeout=self.timeout,
        )
        response.raise_for_status()

        root = fromstring(response.text)
        if root.tag == "eFetchResult":
            return []
        if root.tag != "PubmedArticleSet":
            raise RuntimeError(f"Unexpected root tag '{root.tag}'.")

        articles = []
        for record in root:
            book_document = record.find("BookDocument")
            if book_document is not None:
                articles.append(book_document.find("Book"))
            else:
                articles.append(record.find("MedlineCitation/Article"))
        return articles

    def _transform_query(self, topic: DataFrame) -> DataFrame:
        """Retrieve the documents for a single topic.

        Args:
            topic: Frame with exactly one row that has a ``query_field`` column.

        Returns:
            One row per retrieved document (zero rows if nothing was found).

        Raises:
            RuntimeError: If ``topic`` does not have exactly one row, or if the
                number of fetched records differs from the number of ids.
        """
        if len(topic.index) != 1:
            raise RuntimeError("Can only transform one query at a time.")

        row: Dict[str, Any] = topic.to_dict(orient="records")[0]
        query: str = row[self.query_field]
        query = query.lower()

        search_ids = self._search_ids(query)
        logger.debug("Found %d articles for '%s'.", len(search_ids), query)
        if not search_ids:
            # Nothing to fetch; skip the second request entirely.
            return self._empty_results(topic)

        articles = self._fetch_articles(search_ids)
        logger.debug(
            "Found %d article texts for ids '%s'.",
            len(articles),
            ",".join(search_ids),
        )
        # Ids and records are paired by position, so the counts must agree.
        # An explicit check (instead of ``assert``) also survives ``python -O``.
        if len(articles) != len(search_ids):
            raise RuntimeError(
                f"Expected {len(search_ids)} articles "
                f"but PubMed returned {len(articles)}."
            )

        num_hits = len(search_ids)
        results = []
        for index, (doc_id, article) in enumerate(zip(search_ids, articles)):
            title_element = None
            abstract_element = None
            if article is not None:
                title_element = article.find("BookTitle")
                if title_element is None:
                    title_element = article.find("ArticleTitle")
                abstract_element = article.find("Abstract")

            sections = (
                [
                    self._element_text(section)
                    for section in abstract_element.findall("AbstractText")
                ]
                if abstract_element is not None
                else []
            )
            results.append(
                {
                    **row,
                    "docno": doc_id,
                    # Descending score: the best hit gets ``num_hits``, the
                    # last one gets 1.
                    "score": num_hits - index,
                    "rank": index + 1,
                    "title": self._element_text(title_element),
                    "text": " ".join(section for section in sections if section),
                    "url": f"https://pubmed.ncbi.nlm.nih.gov/{doc_id}/",
                }
            )

        return DataFrame(results)

    def transform(self, topics: DataFrame) -> DataFrame:
        """Retrieve documents for every topic in ``topics``.

        Args:
            topics: Frame with a ``qid`` column and the ``query_field`` column,
                one row per ``qid``.

        Returns:
            All retrieved documents, in the order the ``qid`` values first
            appear in ``topics``, with a fresh 0..n-1 index.

        Raises:
            RuntimeError: If a required column is missing or a ``qid`` occurs
                in more than one row.
        """
        # Local import: the module only imports ``DataFrame`` from pandas.
        from pandas import concat

        if not {"qid", self.query_field}.issubset(topics.columns):
            raise RuntimeError(f"Needs qid and {self.query_field} columns.")

        if len(topics) == 0:
            return self._empty_results(topics)

        # Iterating the groups (rather than ``groupby(...).apply``) always
        # yields the ``qid`` column, independent of the pandas version.
        groups = topics.groupby("qid", sort=False)
        if self.verbose:
            # Show progress while the queries are sent to the API.
            groups = tqdm(
                groups,
                total=groups.ngroups,
                desc="Searching with PubMed API",
                unit="query",
            )

        frames = [self._transform_query(topic) for _, topic in groups]
        frames = [frame for frame in frames if len(frame.index) > 0]
        if not frames:
            return self._empty_results(topics)
        return concat(frames, ignore_index=True)

In [11]:
# Retriever returning up to 10 hits per query, with the progress bar enabled.
retrieve_abstracts = PubMedApiRetrieve(num_results=10, verbose=True)

In [13]:
# One example topic, given as a single record rather than as column lists.
topics = DataFrame([{"qid": 1, "query": "cancer"}])

# Run the retriever on the example topic (this calls the PubMed API).
retrieved = retrieve_abstracts.transform(topics)

Searching with PubMed API:   0%|          | 0/1 [00:00<?, ?query/s]

In [17]:
# Print the result columns, then the title and abstract of the first row.
print(
    retrieved.columns,
    retrieved.iloc[0]["title"],
    retrieved.iloc[0]["text"],
    sep="\n",
)

Index(['qid', 'query', 'docno', 'score', 'rank', 'title', 'text', 'url'], dtype='object')
COVID-19 vaccination in cancer patients: Immune responses one year after the third dose.
Cancer patients (CPs), being immunosuppressed due to the treatment received or to the disease itself, are more susceptible to infections and their potential complications, showing therefore an increased risk of developing severe COVID-19 compared to the general population. We evaluated the immune responses to anti-SARS-CoV-2 vaccination in patients with solid tumors one year after the administration of the third dose and the effect of cancer treatment on vaccine immunogenicity was assessed. Healthy donors (HDs) were enrolled. Binding and neutralizing antibody (Ab) titers were evaluated using chemiluminescence immunoassay (CLIA) and Plaque Reduction Neutralization Test (PRNT) respectively. T-cell response was analyzed using multiparametric flow cytometry. CPs who were administered three vaccine doses showed low