Commit
manubot#337 Add timeout to doi.
This could not be done without touching other modules, as the timeout
has to be passed through calls into functions from those modules.
(The modules are tightly coupled.)
xihh87 committed Jun 15, 2022
1 parent 4641de8 commit 20d6012
Showing 4 changed files with 54 additions and 39 deletions.
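
Every change below follows the same pattern: each requests.get or requests.post call gains a timeout argument, and each public function grows a timeout parameter (defaulting to 3 seconds) that is threaded down to the HTTP call. A minimal sketch of that pattern, with fetch_json as a hypothetical helper that is not part of this commit:

import requests

def fetch_json(url: str, timeout: int = 3) -> dict:
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server; with one, a stalled request raises
    # requests.exceptions.Timeout instead of hanging.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()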
44 changes: 25 additions & 19 deletions manubot/cite/doi.py
@@ -61,7 +61,7 @@ def get_csl_item(self, citekey):
         return get_doi_csl_item(citekey.standard_accession)
 
 
-def expand_short_doi(short_doi: str) -> str:
+def expand_short_doi(short_doi: str, timeout: int = 3) -> str:
     """
     Convert a shortDOI to a regular DOI.
     """
@@ -71,7 +71,7 @@ def expand_short_doi(short_doi: str) -> str:
     )
     url = f"https://doi.org/api/handles/{short_doi.lower()}"
     params = {"type": "HS_ALIAS"}
-    response = requests.get(url, params=params)
+    response = requests.get(url, params=params, timeout=timeout)
     # response documentation at https://www.handle.net/proxy_servlet.html
     results = response.json()
     response_code = results.get("responseCode")  # Handle protocol response code
@@ -97,15 +97,15 @@ def expand_short_doi(short_doi: str) -> str:
     )
 
 
-def get_short_doi_url(doi: str) -> Optional[str]:
+def get_short_doi_url(doi: str, timeout: int = 3) -> Optional[str]:
     """
     Get the shortDOI URL for a DOI.
     """
     quoted_doi = urllib.request.quote(doi)
     url = f"http://shortdoi.org/{quoted_doi}?format=json"
     headers = {"User-Agent": get_manubot_user_agent()}
     try:
-        response = requests.get(url, headers=headers).json()
+        response = requests.get(url, headers=headers, timeout=timeout).json()
         short_doi = response["ShortDOI"]
         short_url = "https://doi.org/" + short_doi[3:]  # Remove "10/" prefix
         return short_url
@@ -118,7 +118,9 @@ def get_short_doi_url(doi: str) -> Optional[str]:
 content_negotiation_url_default: str = "https://doi.org"
 
 
-def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
+def _get_doi_csl_item_negotiation(
+    doi: str, content_negotiation_url: str, timeout: int = 3
+):
     """
     Use Content Negotiation to retrieve the CSL Item metadata for a DOI.
@@ -134,7 +136,7 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
         "Accept": "application/vnd.citationstyles.csl+json",
         "User-Agent": get_manubot_user_agent(),
     }
-    response = requests.get(url, headers=header)
+    response = requests.get(url, headers=header, timeout=timeout)
     try:
         return response.json()
     except Exception as error:
@@ -145,59 +147,63 @@ def _get_doi_csl_item_negotiation(doi: str, content_negotiation_url: str):
         raise error
 
 
-def get_doi_csl_item_datacite(doi: str):
+def get_doi_csl_item_datacite(doi: str, timeout: int = 3):
     """
     As of 2022-01, the DataCite Content Negotiation restricted
     service to just DataCite DOIs, and began returning 404s for Crossref DOIs.
     https://github.com/crosscite/content-negotiation/issues/104
     """
-    return _get_doi_csl_item_negotiation(doi, content_negotiation_url_datacite)
+    return _get_doi_csl_item_negotiation(
+        doi, content_negotiation_url_datacite, timeout=timeout
+    )
 
 
-def get_doi_csl_item_default(doi: str):
+def get_doi_csl_item_default(doi: str, timeout: int = 3):
     """
     doi.org content negotiation redirects to the content negotiation service of
     the Registration Agency, e.g. Crossref or DataCite.
     https://github.com/crosscite/content-negotiation/issues/104
     """
-    return _get_doi_csl_item_negotiation(doi, content_negotiation_url_default)
+    return _get_doi_csl_item_negotiation(
+        doi, content_negotiation_url_default, timeout=timeout
+    )


-def get_doi_csl_item_zotero(doi: str):
+def get_doi_csl_item_zotero(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI using Zotero's translation-server.
     """
     from manubot.cite.zotero import get_csl_item
 
-    return get_csl_item(f"doi:{doi}")
+    return get_csl_item(f"doi:{doi}", timeout=timeout)
 
 
-def get_doi_csl_item_url(doi: str):
+def get_doi_csl_item_url(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI using Zotero's translation-server.
     This function converts the DOI to a URL that presumably resolves to the publisher's site.
     Zotero resolves and scrapes data from the resulting webpage.
     """
     from manubot.cite.url import get_url_csl_item_zotero
 
-    return get_url_csl_item_zotero(f"https://doi.org/{doi}")
+    return get_url_csl_item_zotero(f"https://doi.org/{doi}", timeout=timeout)


 def augment_get_doi_csl_item(function: Callable[..., Any]):
     """
     Decorator providing edits to the csl_item returned by a get_doi_csl_item_* function.
     """
 
-    def wrapper(doi: str):
+    def wrapper(doi: str, timeout: int = 3):
         doi = doi.lower()
         csl_item = function(doi)
         csl_item["DOI"] = doi
         csl_item["URL"] = f"https://doi.org/{doi}"
-        short_doi_url = get_short_doi_url(doi)
+        short_doi_url = get_short_doi_url(doi, timeout=timeout)
         if short_doi_url:
             csl_item["URL"] = short_doi_url
         try:
-            csl_item.update(get_pubmed_ids_for_doi(doi))
+            csl_item.update(get_pubmed_ids_for_doi(doi, timeout=timeout))
         except Exception:
             logging.warning(
                 f"Error calling get_pubmed_ids_for_doi for {doi}", exc_info=True
@@ -208,7 +214,7 @@ def wrapper(doi: str):
 
 
 @augment_get_doi_csl_item
-def get_doi_csl_item(doi: str):
+def get_doi_csl_item(doi: str, timeout: int = 3):
     """
     Generate CSL JSON Data for a DOI.
@@ -220,7 +226,7 @@ def get_doi_csl_item(doi: str):
     # FIXME: this function is repetitive with other get_*_csl_item functions.
     for retriever in doi_retrievers:
         try:
-            return retriever(doi)
+            return retriever(doi, timeout=timeout)
         except Exception as error:
             logging.warning(
                 f"Error in {retriever.__name__} for {doi} "
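
With the timeout in place, a request that exceeds it raises an exception that the retriever loop above already catches, so a slow service fails fast and the next retriever is tried. A small illustration of that behavior, using httpbin.org's delay endpoint purely as a stand-in for a slow server:

import requests

try:
    # The server waits 10 seconds before responding; the client gives up after 3.
    requests.get("https://httpbin.org/delay/10", timeout=3)
except requests.exceptions.Timeout:
    print("request timed out after 3 seconds")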
10 changes: 5 additions & 5 deletions manubot/cite/pubmed.py
@@ -282,7 +282,7 @@ def extract_publication_date_parts(article: ElementTree.Element) -> List[int]:
     return date_parts
 
 
-def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]:
+def get_pmcid_and_pmid_for_doi(doi: str, timeout: int = 3) -> Dict[str, str]:
     """
     Query PMC's ID Converter API to retrieve the PMCID and PMID for a DOI.
     Does not work for DOIs that are in Pubmed but not PubMed Central.
@@ -292,7 +292,7 @@ def get_pmcid_and_pmid_for_doi(doi: str) -> Dict[str, str]:
     assert doi.startswith("10.")
     params = {"ids": doi, "tool": "manubot"}
     url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
-    response = requests.get(url, params)
+    response = requests.get(url, params, timeout=timeout)
     if not response.ok:
         logging.warning(f"Status code {response.status_code} querying {response.url}\n")
         return {}
@@ -358,14 +358,14 @@ def get_pmid_for_doi(doi: str) -> Optional[str]:
     return id_elem.text
 
 
-def get_pubmed_ids_for_doi(doi: str) -> Dict[str, str]:
+def get_pubmed_ids_for_doi(doi: str, timeout: int = 3) -> Dict[str, str]:
     """
     Return a dictionary with PMCID and PMID, if they exist, for the specified
     DOI. See https://github.com/manubot/manubot/issues/45.
     """
-    pubmed_ids = get_pmcid_and_pmid_for_doi(doi)
+    pubmed_ids = get_pmcid_and_pmid_for_doi(doi, timeout=timeout)
     if not pubmed_ids:
-        pmid = get_pmid_for_doi(doi)
+        pmid = get_pmid_for_doi(doi, timeout=timeout)
         if pmid:
             pubmed_ids["PMID"] = pmid
     return pubmed_ids
21 changes: 12 additions & 9 deletions manubot/cite/url.py
@@ -23,11 +23,11 @@ def standardize_prefix_accession(self, accession):
         accession = f"{self.prefix_lower}:{accession}"
         return self.standard_prefix, accession
 
-    def get_csl_item(self, citekey):
-        return get_url_csl_item(citekey.standard_accession)
+    def get_csl_item(self, citekey, timeout=3):
+        return get_url_csl_item(citekey.standard_accession, timeout=timeout)
 
 
-def get_url_csl_item(url: str) -> CSLItem:
+def get_url_csl_item(url: str, timeout: int = 3) -> CSLItem:
     """
     Get csl_item for a URL trying a sequence of strategies.
@@ -38,7 +38,7 @@ def get_url_csl_item(url: str) -> CSLItem:
     """
     for retriever in url_retrievers:
         try:
-            return retriever(url)
+            return retriever(url, timeout=timeout)
         except Exception as error:
             logging.warning(
                 f"Error in {retriever.__name__} for {url} "
@@ -48,22 +48,22 @@ def get_url_csl_item(url: str) -> CSLItem:
     raise Exception(f"all get_url_csl_item methods failed for {url}")
 
 
-def get_url_csl_item_zotero(url: str) -> CSLItem:
+def get_url_csl_item_zotero(url: str, timeout: int = 3) -> CSLItem:
     """
     Use Zotero's translation-server to generate a CSL Item for the specified URL.
     """
     from manubot.cite.zotero import export_as_csl, web_query
 
     zotero_data = web_query(url)
-    csl_data = export_as_csl(zotero_data)
+    csl_data = export_as_csl(zotero_data, timeout=timeout)
     (csl_item,) = csl_data
     if not csl_item.get("URL"):
         # some Zotero translators don't set URL. https://github.com/manubot/manubot/issues/244
         csl_item["URL"] = url
     return csl_item
 
 
-def get_url_csl_item_greycite(url: str) -> CSLItem:
+def get_url_csl_item_greycite(url: str, timeout: int = 3) -> CSLItem:
     """
     Uses Greycite which has experienced uptime problems in the past.
     API calls seem to take at least 15 seconds. Browser requests are much
@@ -85,7 +85,10 @@ def get_url_csl_item_greycite(url: str) -> CSLItem:
         "User-Agent": get_manubot_user_agent(),
     }
     response = requests.get(
-        "http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
+        "http://greycite.knowledgeblog.org/json",
+        params={"uri": url},
+        headers=headers,
+        timeout=timeout,
     )
     response.raise_for_status()
     # Some Greycite responses were valid JSON besides for an error appended
@@ -97,7 +100,7 @@ def get_url_csl_item_greycite(url: str) -> CSLItem:
     return csl_item
 
 
-def get_url_csl_item_manual(url: str) -> CSLItem:
+def get_url_csl_item_manual(url: str, timeout: int = 3) -> CSLItem:
     """
     Manually create csl_item for a URL.
     """
18 changes: 12 additions & 6 deletions manubot/cite/zotero.py
@@ -33,7 +33,9 @@ def web_query(url: str, timeout=3) -> ZoteroData:
     headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
     params = {"single": 1}
     api_url = f"{base_url}/web"
-    response = requests.post(api_url, params=params, headers=headers, data=str(url), timeout=timeout)
+    response = requests.post(
+        api_url, params=params, headers=headers, data=str(url), timeout=timeout
+    )
     try:
         zotero_data = response.json()
     except Exception as error:
@@ -66,7 +68,9 @@ def search_query(identifier: str, timeout=3) -> ZoteroData:
     """
     api_url = f"{base_url}/search"
     headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
-    response = requests.post(api_url, headers=headers, data=str(identifier), timeout=timeout)
+    response = requests.post(
+        api_url, headers=headers, data=str(identifier), timeout=timeout
+    )
     try:
         zotero_data = response.json()
     except Exception as error:
@@ -107,7 +111,9 @@ def export_as_csl(zotero_data: ZoteroData, timeout=3) -> CSLItems:
     api_url = f"{base_url}/export"
     params = {"format": "csljson"}
     headers = {"User-Agent": get_manubot_user_agent()}
-    response = requests.post(api_url, params=params, headers=headers, json=zotero_data, timeout=timeout)
+    response = requests.post(
+        api_url, params=params, headers=headers, json=zotero_data, timeout=timeout
+    )
     if not response.ok:
         message = f"export_as_csl: translation-server returned status code {response.status_code}"
         logging.warning(f"{message} with the following output:\n{response.text}")
@@ -120,13 +126,13 @@ def export_as_csl(zotero_data: ZoteroData, timeout=3) -> CSLItems:
     return csl_items
 
 
-def get_csl_item(identifier: str) -> CSLItem:
+def get_csl_item(identifier: str, timeout=3) -> CSLItem:
     """
     Use a translation-server search query followed by an export query
     to return a CSL Item (the first & only record of the returned CSL JSON).
     """
-    zotero_data = search_query(identifier)
-    csl_items = export_as_csl(zotero_data)
+    zotero_data = search_query(identifier, timeout=timeout)
+    csl_items = export_as_csl(zotero_data, timeout=timeout)
     (csl_item,) = csl_items
     return csl_item
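
Taken together, these changes let callers bound how long citation metadata retrieval may wait on the network. A hypothetical usage sketch, not part of the commit (the DOI shown is the Manubot paper's):

from manubot.cite.doi import get_doi_csl_item

# Allow up to 10 seconds per HTTP request instead of the default 3.
csl_item = get_doi_csl_item("10.1371/journal.pcbi.1007128", timeout=10)
print(csl_item["title"])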
