Skip to content

Commit

Permalink
manubot#337 Add timeout to doi.
Browse files Browse the repository at this point in the history
This change could not be made without touching other modules, because
`doi` calls functions defined in `pubmed`, `url`, and `zotero`.

doi -[depends]-> pubmed
doi -[depends]-> url
doi -[depends]-> zotero
  • Loading branch information
xihh87 committed Jun 17, 2022
1 parent ed74089 commit 68a7276
Showing 1 changed file with 33 additions and 19 deletions.
52 changes: 33 additions & 19 deletions manubot/cite/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from .handlers import Handler
from .pubmed import get_pubmed_ids_for_doi

# Default number of seconds to wait on each outbound HTTP request before
# timing out; callers may override it via the `timeout_seconds` parameter.
default_timeout: int = 3


class Handler_DOI(Handler):

Expand Down Expand Up @@ -61,7 +63,7 @@ def get_csl_item(self, citekey):
return get_doi_csl_item(citekey.standard_accession)


def expand_short_doi(short_doi: str) -> str:
def expand_short_doi(short_doi: str, timeout_seconds: int = default_timeout) -> str:
"""
Convert a shortDOI to a regular DOI.
"""
Expand All @@ -71,7 +73,7 @@ def expand_short_doi(short_doi: str) -> str:
)
url = f"https://doi.org/api/handles/{short_doi.lower()}"
params = {"type": "HS_ALIAS"}
response = requests.get(url, params=params)
response = requests.get(url, params=params, timeout=timeout_seconds)
# response documentation at https://www.handle.net/proxy_servlet.html
results = response.json()
response_code = results.get("responseCode") # Handle protocol response code
Expand All @@ -97,15 +99,17 @@ def expand_short_doi(short_doi: str) -> str:
)


def get_short_doi_url(doi: str) -> Optional[str]:
def get_short_doi_url(
doi: str, timeout_seconds: int = default_timeout
) -> Optional[str]:
"""
Get the shortDOI URL for a DOI.
"""
quoted_doi = urllib.request.quote(doi)
url = f"http://shortdoi.org/{quoted_doi}?format=json"
headers = {"User-Agent": get_manubot_user_agent()}
try:
response = requests.get(url, headers=headers).json()
response = requests.get(url, headers=headers, timeout=timeout_seconds).json()
short_doi = response["ShortDOI"]
short_url = "https://doi.org/" + short_doi[3:] # Remove "10/" prefix
return short_url
Expand All @@ -118,7 +122,9 @@ def get_short_doi_url(doi: str) -> Optional[str]:
content_negotiation_url_default: str = "https://doi.org"


def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
def _get_doi_csl_item_negotiation(
doi: str, content_negotiation_url: str, timeout_seconds: int = default_timeout
):
"""
Use Content Negotiation to retrieve the CSL Item metadata for a DOI.
Expand All @@ -134,7 +140,7 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
"Accept": "application/vnd.citationstyles.csl+json",
"User-Agent": get_manubot_user_agent(),
}
response = requests.get(url, headers=header)
response = requests.get(url, headers=header, timeout=timeout_seconds)
try:
return response.json()
except Exception as error:
Expand All @@ -145,59 +151,67 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
raise error


def get_doi_csl_item_datacite(doi: str, timeout_seconds: int = default_timeout):
    """
    Retrieve CSL Item metadata for `doi` via DataCite's content
    negotiation endpoint.

    NOTE: as of 2022-01, DataCite restricted this service to DataCite
    DOIs and began returning 404s for Crossref DOIs:
    https://github.com/crosscite/content-negotiation/issues/104

    `timeout_seconds` bounds how long the underlying HTTP request may block.
    """
    csl_item = _get_doi_csl_item_negotiation(
        doi,
        content_negotiation_url_datacite,
        timeout_seconds=timeout_seconds,
    )
    return csl_item


def get_doi_csl_item_default(doi: str, timeout_seconds: int = default_timeout):
    """
    Retrieve CSL Item metadata for `doi` via doi.org content negotiation.

    doi.org redirects content negotiation to the service operated by the
    DOI's Registration Agency (e.g. Crossref or DataCite):
    https://github.com/crosscite/content-negotiation/issues/104

    `timeout_seconds` bounds how long the underlying HTTP request may block.
    """
    csl_item = _get_doi_csl_item_negotiation(
        doi,
        content_negotiation_url_default,
        timeout_seconds=timeout_seconds,
    )
    return csl_item


def get_doi_csl_item_zotero(doi: str, timeout_seconds: int = default_timeout):
    """
    Generate CSL JSON Data for `doi` by querying Zotero's
    translation-server with a `doi:` citekey.

    `timeout_seconds` is forwarded to the translation-server request.
    """
    # Function-scope import — presumably deferred to avoid a circular
    # import between the cite submodules (TODO confirm).
    from manubot.cite.zotero import get_csl_item

    citekey = f"doi:{doi}"
    return get_csl_item(citekey, timeout_seconds=timeout_seconds)


def get_doi_csl_item_url(doi: str, timeout_seconds: int = default_timeout):
    """
    Generate CSL JSON Data for `doi` using Zotero's translation-server,
    treating the DOI as a URL: https://doi.org/<doi> presumably resolves
    to the publisher's site, whose webpage Zotero then scrapes for
    metadata.

    `timeout_seconds` is forwarded to the translation-server request.
    """
    # Function-scope import — presumably deferred to avoid a circular
    # import between the cite submodules (TODO confirm).
    from manubot.cite.url import get_url_csl_item_zotero

    doi_url = f"https://doi.org/{doi}"
    return get_url_csl_item_zotero(doi_url, timeout_seconds=timeout_seconds)


def augment_get_doi_csl_item(function: Callable[..., Any]):
"""
Decorator providing edits to the csl_item returned by a get_doi_csl_item_* function.
"""

def wrapper(doi: str):
def wrapper(doi: str, timeout_seconds: int = default_timeout):
doi = doi.lower()
csl_item = function(doi)
csl_item["DOI"] = doi
csl_item["URL"] = f"https://doi.org/{doi}"
short_doi_url = get_short_doi_url(doi)
short_doi_url = get_short_doi_url(doi, timeout_seconds=timeout_seconds)
if short_doi_url:
csl_item["URL"] = short_doi_url
try:
csl_item.update(get_pubmed_ids_for_doi(doi))
csl_item.update(
get_pubmed_ids_for_doi(doi, timeout_seconds=timeout_seconds)
)
except Exception:
logging.warning(
f"Error calling get_pubmed_ids_for_doi for {doi}", exc_info=True
Expand All @@ -208,7 +222,7 @@ def wrapper(doi: str):


@augment_get_doi_csl_item
def get_doi_csl_item(doi: str):
def get_doi_csl_item(doi: str, timeout_seconds: int = default_timeout):
"""
Generate CSL JSON Data for an DOI.
Expand All @@ -220,7 +234,7 @@ def get_doi_csl_item(doi: str):
# FIXME: this function is repetitive with other get_*_csl_item functions.
for retriever in doi_retrievers:
try:
return retriever(doi)
return retriever(doi, timeout_seconds=timeout_seconds)
except Exception as error:
logging.warning(
f"Error in {retriever.__name__} for {doi} "
Expand Down

0 comments on commit 68a7276

Please sign in to comment.