From 86546196557cc346e56f6d141d64f2818a138987 Mon Sep 17 00:00:00 2001 From: Joshua Haase Date: Thu, 16 Jun 2022 21:11:28 -0500 Subject: [PATCH] #337 Add timeout for pubmed search. --- manubot/cite/pubmed.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/manubot/cite/pubmed.py b/manubot/cite/pubmed.py index 475460b8..58c9d8ae 100644 --- a/manubot/cite/pubmed.py +++ b/manubot/cite/pubmed.py @@ -13,6 +13,8 @@ from .citekey import CiteKey from .handlers import Handler +default_timeout = 3 + class Handler_PubMed(Handler): @@ -59,11 +61,15 @@ def inspect(self, citekey: CiteKey) -> Optional[str]: "Double check the PMCID." ) - def get_csl_item(self, citekey: CiteKey): - return get_pmc_csl_item(citekey.standard_accession) + def get_csl_item(self, citekey: CiteKey, timeout_seconds: int = default_timeout): + return get_pmc_csl_item( + citekey.standard_accession, timeout_seconds=timeout_seconds + ) -def get_pmc_csl_item(pmcid: str) -> Dict[str, Any]: +def get_pmc_csl_item( + pmcid: str, timeout_seconds: int = default_timeout +) -> Dict[str, Any]: """ Get the CSL Item for a PubMed Central record by its PMID, PMCID, or DOI, using the NCBI Citation Exporter API. @@ -82,7 +88,7 @@ def get_pmc_csl_item(pmcid: str) -> Dict[str, Any]: def _get_literature_citation_exporter_csl_item( - database: str, identifier: str + database: str, identifier: str, timeout_seconds: int = default_timeout ) -> Dict[str, Any]: """ https://api.ncbi.nlm.nih.gov/lit/ctxp @@ -102,7 +108,7 @@ def _get_literature_citation_exporter_csl_item( params = {"format": "csl", "id": identifier} headers = {"User-Agent": get_manubot_user_agent()} url = f"https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/{database}/" - response = requests.get(url, params, headers=headers) + response = requests.get(url, params, headers=headers, timeout=timeout_seconds) try: csl_item = response.json() except Exception as error: @@ -122,7 +128,9 @@ def _get_literature_citation_exporter_csl_item( return csl_item -def get_pubmed_csl_item(pmid: Union[str, int]) -> Dict[str, Any]: +def get_pubmed_csl_item( + pmid: Union[str, int], timeout_seconds: int = default_timeout +) -> Dict[str, Any]: """ Query NCBI E-Utilities to create CSL Items for PubMed IDs. @@ -134,7 +142,7 @@ def get_pubmed_csl_item(pmid: Union[str, int]) -> Dict[str, Any]: headers = {"User-Agent": get_manubot_user_agent()} url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" with _get_eutils_rate_limiter(): - response = requests.get(url, params, headers=headers) + response = requests.get(url, params, headers=headers, timeout=timeout_seconds) try: xml_article_set = ElementTree.fromstring(response.text) assert isinstance(xml_article_set, ElementTree.Element) @@ -282,7 +290,9 @@ def extract_publication_date_parts(article: ElementTree.Element) -> List[int]: return date_parts -def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]: +def get_pmcid_and_pmid_for_doi( + doi: str, timeout_seconds: int = default_timeout +) -> Dict[str, str]: """ Query PMC's ID Converter API to retrieve the PMCID and PMID for a DOI. Does not work for DOIs that are in Pubmed but not PubMed Central. @@ -292,7 +302,7 @@ def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]: assert doi.startswith("10.") params = {"ids": doi, "tool": "manubot"} url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - response = requests.get(url, params) + response = requests.get(url, params, timeout=timeout_seconds) if not response.ok: logging.warning(f"Status code {response.status_code} querying {response.url}\n") return {} @@ -323,7 +333,7 @@ def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]: return id_dict -def get_pmid_for_doi(doi: str) -> Optional[str]: +def get_pmid_for_doi(doi: str, timeout_seconds: int = default_timeout) -> Optional[str]: """ Query NCBI's E-utilities to retrieve the PMID for a DOI. """ @@ -333,7 +343,7 @@ def get_pmid_for_doi(doi: str) -> Optional[str]: headers = {"User-Agent": get_manubot_user_agent()} url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" with _get_eutils_rate_limiter(): - response = requests.get(url, params, headers=headers) + response = requests.get(url, params, headers=headers, timeout=timeout_seconds) if not response.ok: logging.warning(f"Status code {response.status_code} querying {response.url}\n") return None @@ -358,14 +368,16 @@ def get_pmid_for_doi(doi: str) -> Optional[str]: return id_elem.text -def get_pubmed_ids_for_doi(doi: str) -> Dict[str, str]: +def get_pubmed_ids_for_doi( + doi: str, timeout_seconds: int = default_timeout +) -> Dict[str, str]: """ Return a dictionary with PMCID and PMID, if they exist, for the specified DOI. See https://github.com/manubot/manubot/issues/45. """ - pubmed_ids = get_pmcid_and_pmid_for_doi(doi) + pubmed_ids = get_pmcid_and_pmid_for_doi(doi, timeout_seconds=timeout_seconds) if not pubmed_ids: - pmid = get_pmid_for_doi(doi) + pmid = get_pmid_for_doi(doi, timeout_seconds=timeout_seconds) if pmid: pubmed_ids["PMID"] = pmid return pubmed_ids