Commit
manubot#337 Add timeout to doi.
This could not be done without touching other modules, as the timeout
has to be passed through calls into functions from those modules.
(The modules are tightly coupled.)
xihh87 committed Jun 15, 2022
1 parent 4641de8 commit 20d6012
Showing 4 changed files with 54 additions and 39 deletions.
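
Every change below follows the same pattern: each requests.get or requests.post call gains a timeout argument, and each public function grows a timeout parameter (defaulting to 3 seconds) that is threaded down to the HTTP call. A minimal sketch of that pattern, with fetch_json as a hypothetical helper that is not part of this commit:

import requests

def fetch_json(url: str, timeout: int = 3) -> dict:
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server; with one, a stalled request raises
    # requests.exceptions.Timeout instead of hanging.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()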
44 changes: 25 additions & 19 deletions manubot/cite/doi.py
@@ -61,7 +61,7 @@ def get_csl_item(self, citekey):
         return get_doi_csl_item(citekey.standard_accession)
 
 
-def expand_short_doi(short_doi: str) -> str:
+def expand_short_doi(short_doi: str, timeout: int = 3) -> str:
     """
     Convert a shortDOI to a regular DOI.
     """
@@ -71,7 +71,7 @@ def expand_short_doi(short_doi: str) -> str:
     )
     url = f"https://doi.org/api/handles/{short_doi.lower()}"
     params = {"type": "HS_ALIAS"}
-    response = requests.get(url, params=params)
+    response = requests.get(url, params=params, timeout=timeout)
     # response documentation at https://www.handle.net/proxy_servlet.html
     results = response.json()
     response_code = results.get("responseCode")  # Handle protocol response code
@@ -97,15 +97,15 @@ def expand_short_doi(short_doi: str) -> str:
     )
 
 
-def get_short_doi_url(doi: str) -> Optional[str]:
+def get_short_doi_url(doi: str, timeout: int = 3) -> Optional[str]:
     """
     Get the shortDOI URL for a DOI.
     """
     quoted_doi = urllib.request.quote(doi)
     url = f"http://shortdoi.org/{quoted_doi}?format=json"
     headers = {"User-Agent": get_manubot_user_agent()}
     try:
-        response = requests.get(url, headers=headers).json()
+        response = requests.get(url, headers=headers, timeout=timeout).json()
         short_doi = response["ShortDOI"]
         short_url = "https://doi.org/" + short_doi[3:]  # Remove "10/" prefix
         return short_url
@@ -118,7 +118,9 @@ def get_short_doi_url(doi: str) -> Optional[str]:
 content_negotiation_url_default: str = "https://doi.org"
 
 
-def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
+def _get_doi_csl_item_negotiation(
+    doi: str, content_negotiation_url: str, timeout: int = 3
+):
     """
     Use Content Negotiation to retrieve the CSL Item metadata for a DOI.
@@ -134,7 +136,7 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
         "Accept": "application/vnd.citationstyles.csl+json",
         "User-Agent": get_manubot_user_agent(),
     }
-    response = requests.get(url, headers=header)
+    response = requests.get(url, headers=header, timeout=timeout)
     try:
         return response.json()
     except Exception as error:
@@ -145,59 +147,63 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
         raise error
 
 
-def get_doi_csl_item_datacite(doi: str):
+def get_doi_csl_item_datacite(doi: str, timeout: int = 3):
     """
     As of 2022-01, the DataCite Content Negotiation restricted
     service to just DataCite DOIs, and began returning 404s for Crossref DOIs.
     https://github.com/crosscite/content-negotiation/issues/104
     """
-    return _get_doi_csl_item_negotiation(doi, content_negotiation_url_datacite)
+    return _get_doi_csl_item_negotiation(
+        doi, content_negotiation_url_datacite, timeout=timeout
+    )
 
 
-def get_doi_csl_item_default(doi: str):
+def get_doi_csl_item_default(doi: str, timeout: int = 3):
     """
     doi.org content negotiation redirects to the content negotiation service of
     the Registration Agency, e.g. Crossref or DataCite.
     https://github.com/crosscite/content-negotiation/issues/104
     """
-    return _get_doi_csl_item_negotiation(doi, content_negotiation_url_default)
+    return _get_doi_csl_item_negotiation(
+        doi, content_negotiation_url_default, timeout=timeout
+    )


-def get_doi_csl_item_zotero(doi: str):
+def get_doi_csl_item_zotero(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI using Zotero's translation-server.
     """
     from manubot.cite.zotero import get_csl_item
 
-    return get_csl_item(f"doi:{doi}")
+    return get_csl_item(f"doi:{doi}", timeout=timeout)
 
 
-def get_doi_csl_item_url(doi: str):
+def get_doi_csl_item_url(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI using Zotero's translation-server.
     This function converts the DOI to a URL that presumably resolves to the publisher's site.
     Zotero resolves and scrapes data from the resulting webpage.
     """
     from manubot.cite.url import get_url_csl_item_zotero
 
-    return get_url_csl_item_zotero(f"https://doi.org/{doi}")
+    return get_url_csl_item_zotero(f"https://doi.org/{doi}", timeout=timeout)


 def augment_get_doi_csl_item(function: Callable[..., Any]):
     """
     Decorator providing edits to the csl_item returned by a get_doi_csl_item_* function.
     """
 
-    def wrapper(doi: str):
+    def wrapper(doi: str, timeout: int = 3):
         doi = doi.lower()
         csl_item = function(doi)
         csl_item["DOI"] = doi
         csl_item["URL"] = f"https://doi.org/{doi}"
-        short_doi_url = get_short_doi_url(doi)
+        short_doi_url = get_short_doi_url(doi, timeout=timeout)
         if short_doi_url:
             csl_item["URL"] = short_doi_url
         try:
-            csl_item.update(get_pubmed_ids_for_doi(doi))
+            csl_item.update(get_pubmed_ids_for_doi(doi, timeout=timeout))
         except Exception:
             logging.warning(
                 f"Error calling get_pubmed_ids_for_doi for {doi}", exc_info=True
@@ -208,7 +214,7 @@ def wrapper(doi: str):
 
 
 @augment_get_doi_csl_item
-def get_doi_csl_item(doi: str):
+def get_doi_csl_item(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI.
@@ -220,7 +226,7 @@ def get_doi_csl_item(doi: str):
     # FIXME: this function is repetitive with other get_*_csl_item functions.
     for retriever in doi_retrievers:
         try:
-            return retriever(doi)
+            return retriever(doi, timeout=timeout)
         except Exception as error:
             logging.warning(
                 f"Error in {retriever.__name__} for {doi} "
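
With the timeout in place, a request that exceeds it raises an exception that the retriever loop above already catches, so a slow service fails fast and the next retriever is tried. A small illustration of that behavior, using httpbin.org's delay endpoint purely as a stand-in for a slow server:

import requests

try:
    # The server waits 10 seconds before responding; the client gives up after 3.
    requests.get("https://httpbin.org/delay/10", timeout=3)
except requests.exceptions.Timeout:
    print("request timed out after 3 seconds")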
10 changes: 5 additions & 5 deletions manubot/cite/pubmed.py
@@ -282,7 +282,7 @@ def extract_publication_date_parts(article: ElementTree.Element) -> List[int]:
     return date_parts
 
 
-def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]:
+def get_pmcid_and_pmid_for_doi(doi: str, timeout: int = 3) -> Dict[str, str]:
     """
     Query PMC's ID Converter API to retrieve the PMCID and PMID for a DOI.
     Does not work for DOIs that are in Pubmed but not PubMed Central.
@@ -292,7 +292,7 @@ def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]:
     assert doi.startswith("10.")
     params = {"ids": doi, "tool": "manubot"}
     url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
-    response = requests.get(url, params)
+    response = requests.get(url, params, timeout=timeout)
     if not response.ok:
         logging.warning(f"Status code {response.status_code} querying {response.url}\n")
         return {}
@@ -358,14 +358,14 @@ def get_pmid_for_doi(doi: str) -> Optional[str]:
     return id_elem.text
 
 
-def get_pubmed_ids_for_doi(doi: str) -> Dict[str, str]:
+def get_pubmed_ids_for_doi(doi: str, timeout: int = 3) -> Dict[str, str]:
     """
     Return a dictionary with PMCID and PMID, if they exist, for the specified
     DOI. See https://github.com/manubot/manubot/issues/45.
     """
-    pubmed_ids = get_pmcid_and_pmid_for_doi(doi)
+    pubmed_ids = get_pmcid_and_pmid_for_doi(doi, timeout=timeout)
     if not pubmed_ids:
-        pmid = get_pmid_for_doi(doi)
+        pmid = get_pmid_for_doi(doi, timeout=timeout)
         if pmid:
             pubmed_ids["PMID"] = pmid
     return pubmed_ids
21 changes: 12 additions & 9 deletions manubot/cite/url.py
@@ -23,11 +23,11 @@ def standardize_prefix_accession(self, accession):
         accession = f"{self.prefix_lower}:{accession}"
         return self.standard_prefix, accession
 
-    def get_csl_item(self, citekey):
-        return get_url_csl_item(citekey.standard_accession)
+    def get_csl_item(self, citekey, timeout=3):
+        return get_url_csl_item(citekey.standard_accession, timeout=timeout)
 
 
-def get_url_csl_item(url: str) -> CSLItem:
+def get_url_csl_item(url: str, timeout: int = 3) -> CSLItem:
     """
     Get csl_item for a URL trying a sequence of strategies.
@@ -38,7 +38,7 @@ def get_url_csl_item(url: str) -> CSLItem:
     """
     for retriever in url_retrievers:
         try:
-            return retriever(url)
+            return retriever(url, timeout=timeout)
         except Exception as error:
             logging.warning(
                 f"Error in {retriever.__name__} for {url} "
@@ -48,22 +48,22 @@ def get_url_csl_item(url: str) -> CSLItem:
     raise Exception(f"all get_url_csl_item methods failed for {url}")
 
 
-def get_url_csl_item_zotero(url: str) -> CSLItem:
+def get_url_csl_item_zotero(url: str, timeout: int = 3) -> CSLItem:
     """
     Use Zotero's translation-server to generate a CSL Item for the specified URL.
     """
     from manubot.cite.zotero import export_as_csl, web_query
 
     zotero_data = web_query(url)
-    csl_data = export_as_csl(zotero_data)
+    csl_data = export_as_csl(zotero_data, timeout=timeout)
     (csl_item,) = csl_data
     if not csl_item.get("URL"):
         # some Zotero translators don't set URL. https://github.com/manubot/manubot/issues/244
         csl_item["URL"] = url
     return csl_item
 
 
-def get_url_csl_item_greycite(url: str) -> CSLItem:
+def get_url_csl_item_greycite(url: str, timeout: int = 3) -> CSLItem:
     """
     Uses Greycite which has experienced uptime problems in the past.
     API calls seem to take at least 15 seconds. Browser requests are much
@@ -85,7 +85,10 @@ def get_url_csl_item_greycite(url: str) -> CSLItem:
         "User-Agent": get_manubot_user_agent(),
     }
     response = requests.get(
-        "http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
+        "http://greycite.knowledgeblog.org/json",
+        params={"uri": url},
+        headers=headers,
+        timeout=timeout,
     )
     response.raise_for_status()
     # Some Greycite responses were valid JSON besides for an error appended
@@ -97,7 +100,7 @@ def get_url_csl_item_greycite(url: str) -> CSLItem:
     return csl_item
 
 
-def get_url_csl_item_manual(url: str) -> CSLItem:
+def get_url_csl_item_manual(url: str, timeout: int = 3) -> CSLItem:
     """
     Manually create csl_item for a URL.
     """
18 changes: 12 additions & 6 deletions manubot/cite/zotero.py
@@ -33,7 +33,9 @@ def web_query(url: str, timeout=3) -> ZoteroData:
     headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
     params = {"single": 1}
     api_url = f"{base_url}/web"
-    response = requests.post(api_url, params=params, headers=headers, data=str(url), timeout=timeout)
+    response = requests.post(
+        api_url, params=params, headers=headers, data=str(url), timeout=timeout
+    )
     try:
         zotero_data = response.json()
     except Exception as error:
@@ -66,7 +68,9 @@ def search_query(identifier: str, timeout=3) -> ZoteroData:
     """
     api_url = f"{base_url}/search"
     headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
-    response = requests.post(api_url, headers=headers, data=str(identifier), timeout=timeout)
+    response = requests.post(
+        api_url, headers=headers, data=str(identifier), timeout=timeout
+    )
     try:
         zotero_data = response.json()
     except Exception as error:
@@ -107,7 +111,9 @@ def export_as_csl(zotero_data: ZoteroData, timeout=3) -> CSLItems:
     api_url = f"{base_url}/export"
     params = {"format": "csljson"}
     headers = {"User-Agent": get_manubot_user_agent()}
-    response = requests.post(api_url, params=params, headers=headers, json=zotero_data, timeout=timeout)
+    response = requests.post(
+        api_url, params=params, headers=headers, json=zotero_data, timeout=timeout
+    )
     if not response.ok:
         message = f"export_as_csl: translation-server returned status code {response.status_code}"
         logging.warning(f"{message} with the following output:\n{response.text}")
@@ -120,13 +126,13 @@ def export_as_csl(zotero_data: ZoteroData, timeout=3) -> CSLItems:
     return csl_items
 
 
-def get_csl_item(identifier: str) -> CSLItem:
+def get_csl_item(identifier: str, timeout=3) -> CSLItem:
     """
     Use a translation-server search query followed by an export query
     to return a CSL Item (the first & only record of the returned CSL JSON).
     """
-    zotero_data = search_query(identifier)
-    csl_items = export_as_csl(zotero_data)
+    zotero_data = search_query(identifier, timeout=timeout)
+    csl_items = export_as_csl(zotero_data, timeout=timeout)
     (csl_item,) = csl_items
     return csl_item
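
Taken together, these changes let callers bound how long citation metadata retrieval may wait on the network. A hypothetical usage sketch, not part of the commit (the DOI shown is the Manubot paper's):

from manubot.cite.doi import get_doi_csl_item

# Allow up to 10 seconds per HTTP request instead of the default 3.
csl_item = get_doi_csl_item("10.1371/journal.pcbi.1007128", timeout=10)
print(csl_item["title"])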
