Skip to content

Commit

Permalink
manubot#337 Allow users to specify timeout.
Browse files Browse the repository at this point in the history
- - -

doi -[depends]-> pubmed
doi -[depends]-> url
doi -[depends]-> zotero

- - -

wikidata -[depends]-> url
  • Loading branch information
xihh87 authored and Joshua Haase committed Feb 6, 2024
1 parent 29dec5b commit 21709b7
Show file tree
Hide file tree
Showing 18 changed files with 251 additions and 103 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ usage: manubot process [-h] --content-directory CONTENT_DIRECTORY
--skip-citations [--cache-directory CACHE_DIRECTORY]
[--clear-requests-cache] [--skip-remote]
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-t TIMEOUT_SECONDS]
Process manuscript content to create outputs for Pandoc consumption. Performs
bibliographic processing and templating.
Expand Down Expand Up @@ -149,6 +150,8 @@ options:
repository remotes.
--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the logging level for stderr logging
-t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
timeout for web requests
```

#### Manual references
Expand Down Expand Up @@ -183,6 +186,7 @@ usage: manubot cite [-h] [--output OUTPUT]
[--csl CSL] [--bibliography BIBLIOGRAPHY]
[--no-infer-prefix] [--allow-invalid-csl-data]
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-t TIMEOUT_SECONDS]
citekeys [citekeys ...]
Generate bibliographic metadata in CSL JSON format for one or more citation
Expand Down Expand Up @@ -220,6 +224,8 @@ options:
Schema. Skips CSL pruning.
--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the logging level for stderr logging
-t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
timeout for web requests
```

### Pandoc filter
Expand Down Expand Up @@ -273,6 +279,7 @@ The `manubot webpage` command populates a `webpage` directory with Manubot outpu
usage: manubot webpage [-h] [--checkout [CHECKOUT]] [--version VERSION]
[--timestamp] [--no-ots-cache | --ots-cache OTS_CACHE]
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
[-t TIMEOUT_SECONDS]
Update the webpage directory tree with Manubot output files. This command
should be run from the root directory of a Manubot manuscript that follows the
Expand Down Expand Up @@ -301,6 +308,8 @@ options:
ci/cache/ots).
--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the logging level for stderr logging
-t TIMEOUT_SECONDS, --timeout TIMEOUT_SECONDS
timeout for web requests
```

## Development
Expand Down
46 changes: 33 additions & 13 deletions manubot/cite/arxiv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import re
import typing as tp
import xml.etree.ElementTree

import requests
Expand All @@ -9,14 +10,15 @@
from .csl_item import CSL_Item
from .handlers import Handler

default_timeout = (3, 15)

class Handler_arXiv(Handler):

class Handler_arXiv(Handler):
standard_prefix = "arxiv"

prefixes = [
"arxiv",
]

accession_pattern = re.compile(
r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
)
Expand All @@ -26,8 +28,14 @@ def inspect(self, citekey):
if not self._get_pattern().fullmatch(citekey.accession):
return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

def get_csl_item(self, citekey):
return get_arxiv_csl_item(citekey.standard_accession)
def get_csl_item(
    self,
    citekey,
    timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout,
):
    """
    Return the CSL Item for an arXiv citekey.

    `timeout_seconds` is handed to `get_arxiv_csl_item` together with
    the citekey's standard accession.
    """
    # Forward the caller's timeout; passing `default_timeout` here would
    # silently discard any user-specified value.
    return get_arxiv_csl_item(
        citekey.standard_accession, timeout_seconds=timeout_seconds
    )


class CSL_Item_arXiv(CSL_Item):
Expand Down Expand Up @@ -65,27 +73,33 @@ def split_arxiv_id_version(arxiv_id: str):
return match.group("versionless_id"), match.group("version")


def get_arxiv_csl_item(arxiv_id: str):
def get_arxiv_csl_item(
arxiv_id: str, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
):
"""
Return csl_item item for an arXiv identifier.
Chooses which arXiv API to use based on whether arxiv_id
is versioned, since only one endpoint supports versioning.
"""
_, version = split_arxiv_id_version(arxiv_id)
if version:
return get_arxiv_csl_item_export_api(arxiv_id)
return get_arxiv_csl_item_oai(arxiv_id)
return get_arxiv_csl_item_export_api(arxiv_id, timeout_seconds=timeout_seconds)
return get_arxiv_csl_item_oai(arxiv_id, timeout_seconds=timeout_seconds)


def query_arxiv_api(url, params):
def query_arxiv_api(
url, params, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
):
headers = {"User-Agent": get_manubot_user_agent()}
response = requests.get(url, params, headers=headers)
response = requests.get(url, params, headers=headers, timeout=timeout_seconds)
response.raise_for_status()
xml_tree = xml.etree.ElementTree.fromstring(response.text)
return xml_tree


def get_arxiv_csl_item_export_api(arxiv_id):
def get_arxiv_csl_item_export_api(
arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
):
"""
Return csl_item item for an arXiv record.
Expand All @@ -105,6 +119,7 @@ def get_arxiv_csl_item_export_api(arxiv_id):
xml_tree = query_arxiv_api(
url="https://export.arxiv.org/api/query",
params={"id_list": arxiv_id, "max_results": 1},
timeout_seconds=timeout_seconds,
)

# XML namespace prefixes
Expand Down Expand Up @@ -157,7 +172,9 @@ def get_arxiv_csl_item_export_api(arxiv_id):
return csl_item


def get_arxiv_csl_item_oai(arxiv_id):
def get_arxiv_csl_item_oai(
arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
):
"""
Generate a CSL Item for an unversioned arXiv identifier
using arXiv's OAI_PMH v2.0 API <https://arxiv.org/help/oa>.
Expand All @@ -174,6 +191,7 @@ def get_arxiv_csl_item_oai(arxiv_id):
"metadataPrefix": "arXiv",
"identifier": f"oai:arXiv.org:{arxiv_id}",
},
timeout_seconds=timeout_seconds,
)

# Create dictionary for CSL Item
Expand Down Expand Up @@ -237,10 +255,12 @@ def remove_newlines(text):
return re.sub(pattern=r"\n(?!\s)", repl=" ", string=text)


def get_arxiv_csl_item_zotero(arxiv_id):
def get_arxiv_csl_item_zotero(
arxiv_id, timeout_seconds: tp.Union[tuple, int, float, None] = default_timeout
):
"""
Generate CSL JSON Data for an arXiv ID using Zotero's translation-server.
"""
from manubot.cite.zotero import get_csl_item

return get_csl_item(f"arxiv:{arxiv_id}")
return get_csl_item(f"arxiv:{arxiv_id}", timeout_seconds=timeout_seconds)
9 changes: 8 additions & 1 deletion manubot/cite/citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,18 @@ class Citations:
# `sort_csl_items=False` retains order of input_ids in get_csl_items.
# (input_ids with the same standard_id will still be deduplicated).
sort_csl_items: bool = True
# timeout for requests
timeout_seconds: tp.Union[tuple, int, float, None] = (3, 15)

def __post_init__(self):
input_ids = list(dict.fromkeys(self.input_ids)) # deduplicate
self.citekeys = [
CiteKey(x, aliases=self.aliases, infer_prefix=self.infer_citekey_prefixes)
CiteKey(
x,
aliases=self.aliases,
infer_prefix=self.infer_citekey_prefixes,
timeout_seconds=self.timeout_seconds,
)
for x in input_ids
]

Expand Down
1 change: 1 addition & 0 deletions manubot/cite/cite_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def cli_cite(args: argparse.Namespace):
infer_citekey_prefixes=args.infer_prefix,
prune_csl_items=args.prune_csl,
sort_csl_items=False,
timeout_seconds=args.timeout_seconds,
)
citations.load_manual_references(paths=args.bibliography)
citations.inspect(log_level="WARNING")
Expand Down
4 changes: 3 additions & 1 deletion manubot/cite/citekey.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class CiteKey:
"""Mapping from input identifier to aliases"""
infer_prefix: bool = True
"""Whether to infer the citekey's prefix when a prefix is missing or unhandled"""
timeout_seconds: tp.Union[tuple, int, float, None] = (3, 15)
"""The time to wait when making requests (passed to the handler's get_csl_item)"""

def __post_init__(self):
self.check_input_id(self.input_id)
Expand Down Expand Up @@ -208,7 +210,7 @@ def __repr__(self):
def csl_item(self):
from .csl_item import CSL_Item

csl_item = self.handler.get_csl_item(self)
csl_item = self.handler.get_csl_item(self, self.timeout_seconds)
if not isinstance(csl_item, CSL_Item):
csl_item = CSL_Item(csl_item)
csl_item.set_id(self.standard_id)
Expand Down
12 changes: 8 additions & 4 deletions manubot/cite/curie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"synonyms",
}


default_timeout = (3, 15)
bioregistry_path = pathlib.Path(__file__).parent.joinpath("bioregistry.json")


Expand Down Expand Up @@ -78,11 +78,15 @@ def __post_init__(self):
if "pattern" in self.resource:
self.accession_pattern = self.resource["pattern"]

def get_csl_item(self, citekey: CiteKey):
def get_csl_item(
self,
citekey: CiteKey,
timeout_seconds: typing.Union[tuple, int, float, None] = default_timeout,
):
from ..url import get_url_csl_item

url = self.get_url(accession=citekey.standard_accession)
return get_url_csl_item(url)
return get_url_csl_item(url, timeout_seconds=timeout_seconds)

def inspect(self, citekey: CiteKey) -> typing.Optional[str]:
pattern = self._get_pattern("accession_pattern")
Expand Down Expand Up @@ -111,7 +115,7 @@ def _download_bioregistry() -> None:
import requests

url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/registry/registry.json"
response = requests.get(url)
response = requests.get(url, timeout=default_timeout)
response.raise_for_status()
results = response.json()
assert isinstance(results, dict)
Expand Down
Loading

0 comments on commit 21709b7

Please sign in to comment.