manubot#337 Allow users to specify timeout.

- - -
doi -[depends]-> pubmed
doi -[depends]-> url
doi -[depends]-> zotero
- - -
wikidata -[depends]-> url
xihh87 · Feb 6, 2024 · 21709b7 · 21709b7
1 parent 29dec5b
commit 21709b7
Show file tree

Hide file tree

Showing 18 changed files with 251 additions and 103 deletions.
diff --git a/README.md b/README.md
@@ -111,6 +111,7 @@ usage: manubot process [-h] --content-directory CONTENT_DIRECTORY
                        --skip-citations [--cache-directory CACHE_DIRECTORY]
                        [--clear-requests-cache] [--skip-remote]
                        [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
+                       [-t TIMEOUT_SECONDS]
 
 Process manuscript content to create outputs for Pandoc consumption. Performs
 bibliographic processing and templating.
@@ -149,6 +150,8 @@ options:
                         repository remotes.
   --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                         Set the logging level for stderr logging
+  -t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
+                        timeout for web requests
 ```
 
 #### Manual references
@@ -183,6 +186,7 @@ usage: manubot cite [-h] [--output OUTPUT]
                     [--csl CSL] [--bibliography BIBLIOGRAPHY]
                     [--no-infer-prefix] [--allow-invalid-csl-data]
                     [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
+                    [-t TIMEOUT_SECONDS]
                     citekeys [citekeys ...]
 
 Generate bibliographic metadata in CSL JSON format for one or more citation
@@ -220,6 +224,8 @@ options:
                         Schema. Skips CSL pruning.
   --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                         Set the logging level for stderr logging
+  -t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
+                        timeout for web requests
 ```
 
 ### Pandoc filter
@@ -273,6 +279,7 @@ The `manubot webpage` command populates a `webpage` directory with Manubot outpu
 usage: manubot webpage [-h] [--checkout [CHECKOUT]] [--version VERSION]
                        [--timestamp] [--no-ots-cache | --ots-cache OTS_CACHE]
                        [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
+                       [-t TIMEOUT_SECONDS]
 
 Update the webpage directory tree with Manubot output files. This command
 should be run from the root directory of a Manubot manuscript that follows the
@@ -301,6 +308,8 @@ options:
                         ci/cache/ots).
   --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                         Set the logging level for stderr logging
+  -t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
+                        timeout for web requests
 ```
 
 ## Development

diff --git a/manubot/cite/arxiv.py b/manubot/cite/arxiv.py
@@ -1,5 +1,6 @@
 import logging
 import re
+import typing as tp
 import xml.etree.ElementTree
 
 import requests
@@ -9,14 +10,15 @@
 from .csl_item import CSL_Item
 from .handlers import Handler
 
+default_timeout = (3, 15)
 
-class Handler_arXiv(Handler):
 
+class Handler_arXiv(Handler):
     standard_prefix = "arxiv"
-
     prefixes = [
         "arxiv",
     ]
+
     accession_pattern = re.compile(
         r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
     )
@@ -26,8 +28,14 @@ def inspect(self, citekey):
         if not self._get_pattern().fullmatch(citekey.accession):
             return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."
 
-    def get_csl_item(self, citekey):
-        return get_arxiv_csl_item(citekey.standard_accession)
+    def get_csl_item(
+        self,
+        citekey,
+        timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout,
+    ):
+        return get_arxiv_csl_item(  # honor the caller's timeout
+            citekey.standard_accession, timeout_seconds=timeout_seconds
+        )
 
 
 class CSL_Item_arXiv(CSL_Item):
@@ -65,27 +73,33 @@ def split_arxiv_id_version(arxiv_id: str):
     return match.group("versionless_id"), match.group("version")
 
 
-def get_arxiv_csl_item(arxiv_id: str):
+def get_arxiv_csl_item(
+    arxiv_id: str, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
+):
     """
     Return csl_item item for an arXiv identifier.
     Chooses which arXiv API to use based on whether arxiv_id
     is versioned, since only one endpoint supports versioning.
     """
     _, version = split_arxiv_id_version(arxiv_id)
     if version:
-        return get_arxiv_csl_item_export_api(arxiv_id)
-    return get_arxiv_csl_item_oai(arxiv_id)
+        return get_arxiv_csl_item_export_api(arxiv_id, timeout_seconds=timeout_seconds)
+    return get_arxiv_csl_item_oai(arxiv_id, timeout_seconds=timeout_seconds)
 
 
-def query_arxiv_api(url, params):
+def query_arxiv_api(
+    url, params, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
+):
     headers = {"User-Agent": get_manubot_user_agent()}
-    response = requests.get(url, params, headers=headers)
+    response = requests.get(url, params, headers=headers, timeout=timeout_seconds)
     response.raise_for_status()
     xml_tree = xml.etree.ElementTree.fromstring(response.text)
     return xml_tree
 
 
-def get_arxiv_csl_item_export_api(arxiv_id):
+def get_arxiv_csl_item_export_api(
+    arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
+):
     """
     Return csl_item item for an arXiv record.
 
@@ -105,6 +119,7 @@ def get_arxiv_csl_item_export_api(arxiv_id):
     xml_tree = query_arxiv_api(
         url="https://export.arxiv.org/api/query",
         params={"id_list": arxiv_id, "max_results": 1},
+        timeout_seconds=timeout_seconds,
     )
 
     # XML namespace prefixes
@@ -157,7 +172,9 @@ def get_arxiv_csl_item_export_api(arxiv_id):
     return csl_item
 
 
-def get_arxiv_csl_item_oai(arxiv_id):
+def get_arxiv_csl_item_oai(
+    arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
+):
     """
     Generate a CSL Item for an unversioned arXiv identifier
     using arXiv's OAI_PMH v2.0 API <https://arxiv.org/help/oa>.
@@ -174,6 +191,7 @@ def get_arxiv_csl_item_oai(arxiv_id):
             "metadataPrefix": "arXiv",
             "identifier": f"oai:arXiv.org:{arxiv_id}",
         },
+        timeout_seconds=timeout_seconds,
     )
 
     # Create dictionary for CSL Item
@@ -237,10 +255,12 @@ def remove_newlines(text):
     return re.sub(pattern=r"\n(?!\s)", repl=" ", string=text)
 
 
-def get_arxiv_csl_item_zotero(arxiv_id):
+def get_arxiv_csl_item_zotero(
+    arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
+):
     """
     Generate CSL JSON Data for an arXiv ID using Zotero's translation-server.
     """
     from manubot.cite.zotero import get_csl_item
 
-    return get_csl_item(f"arxiv:{arxiv_id}")
+    return get_csl_item(f"arxiv:{arxiv_id}", timeout_seconds=timeout_seconds)
diff --git a/manubot/cite/citations.py b/manubot/cite/citations.py
@@ -31,11 +31,18 @@ class Citations:
     # `sort_csl_items=False` retains order of input_ids in get_csl_items.
     # (input_ids with the same standard_id will still be deduplicated).
     sort_csl_items: bool = True
+    # timeout for requests
+    timeout_seconds: tp.Union[tuple, int, float, None] = (3, 15)
 
     def __post_init__(self):
         input_ids = list(dict.fromkeys(self.input_ids))  # deduplicate
         self.citekeys = [
-            CiteKey(x, aliases=self.aliases, infer_prefix=self.infer_citekey_prefixes)
+            CiteKey(
+                x,
+                aliases=self.aliases,
+                infer_prefix=self.infer_citekey_prefixes,
+                timeout_seconds=self.timeout_seconds,
+            )
             for x in input_ids
         ]
 

diff --git a/manubot/cite/cite_command.py b/manubot/cite/cite_command.py
@@ -94,6 +94,7 @@ def cli_cite(args: argparse.Namespace):
         infer_citekey_prefixes=args.infer_prefix,
         prune_csl_items=args.prune_csl,
         sort_csl_items=False,
+        timeout_seconds=args.timeout_seconds,
     )
     citations.load_manual_references(paths=args.bibliography)
     citations.inspect(log_level="WARNING")

diff --git a/manubot/cite/citekey.py b/manubot/cite/citekey.py
@@ -21,6 +21,8 @@ class CiteKey:
     """Mapping from input identifier to aliases"""
     infer_prefix: bool = True
     """Whether to infer the citekey's prefix when a prefix is missing or unhandled"""
+    timeout_seconds: tp.Union[tuple, int, float, None] = (3, 15)
+    """Timeout for web requests: seconds, or a (connect, read) tuple"""
 
     def __post_init__(self):
         self.check_input_id(self.input_id)
@@ -208,7 +210,7 @@ def __repr__(self):
     def csl_item(self):
         from .csl_item import CSL_Item
 
-        csl_item = self.handler.get_csl_item(self)
+        csl_item = self.handler.get_csl_item(self, self.timeout_seconds)
         if not isinstance(csl_item, CSL_Item):
             csl_item = CSL_Item(csl_item)
         csl_item.set_id(self.standard_id)

diff --git a/manubot/cite/curie/__init__.py b/manubot/cite/curie/__init__.py
@@ -50,7 +50,7 @@
     "synonyms",
 }
 
-
+default_timeout = (3, 15)
 bioregistry_path = pathlib.Path(__file__).parent.joinpath("bioregistry.json")
 
 
@@ -78,11 +78,15 @@ def __post_init__(self):
         if "pattern" in self.resource:
             self.accession_pattern = self.resource["pattern"]
 
-    def get_csl_item(self, citekey: CiteKey):
+    def get_csl_item(
+        self,
+        citekey: CiteKey,
+        timeout_seconds: typing.Union[tuple, int, float, None] = default_timeout,
+    ):
         from ..url import get_url_csl_item
 
         url = self.get_url(accession=citekey.standard_accession)
-        return get_url_csl_item(url)
+        return get_url_csl_item(url, timeout_seconds=timeout_seconds)
 
     def inspect(self, citekey: CiteKey) -> typing.Optional[str]:
         pattern = self._get_pattern("accession_pattern")
@@ -111,7 +115,7 @@ def _download_bioregistry() -> None:
     import requests
 
     url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/registry/registry.json"
-    response = requests.get(url)
+    response = requests.get(url, timeout=default_timeout)
     response.raise_for_status()
     results = response.json()
     assert isinstance(results, dict)