class TestUnpywallCli:
    """Exit-code and error checks for the ``unpywall`` command line."""

    def test_main(self):
        """Help request exits with 0, an unknown command exits with 1."""
        with pytest.raises(SystemExit) as help_exit:
            main(test_args=['-h'])
        assert help_exit.value.code == 0

        with pytest.raises(SystemExit) as unknown_command_exit:
            main(test_args=['this is a bad argument'])
        assert unknown_command_exit.value.code == 1

    def test_view(self):
        """``view`` exits with 2 without a DOI and raises on a bad DOI."""
        with pytest.raises(SystemExit) as missing_doi_exit:
            main(test_args=['view'])
        assert missing_doi_exit.value.code == 2

        bad_doi = 'this is a bad doi'
        with pytest.raises(HTTPError):
            main(test_args=['view', bad_doi])
@staticmethod
def get_json(doi: str,
             errors: str = 'raise',
             force: bool = False,
             ignore_cache: bool = False):
    """
    This function returns all information in Unpaywall about the given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    errors : str
        Either 'raise' or 'ignore'. If the parameter errors is set to
        'ignore' then errors will not raise an exception.
    force : bool
        Whether to force the cache to retrieve a new entry.
    ignore_cache : bool
        Whether to use or ignore the cache.

    Returns
    -------
    JSON object or None
        A JSON data structure containing all information returned by
        Unpaywall about the given DOI. None is returned if the cache did
        not hand back a response (e.g. a failed request while errors is
        set to 'ignore').

    Raises
    ------
    ValueError
        Propagated from ``r.json()`` if the response body is not valid
        JSON (requests documents a ValueError subclass for this case).
    """
    from .cache import cache

    r = cache.get(doi,
                  errors=errors,
                  force=force,
                  ignore_cache=ignore_cache)

    # Test for the missing response explicitly instead of catching
    # AttributeError around r.json(): the broad catch would also hide
    # genuine attribute errors raised while decoding the response.
    if r is None:
        return None

    return r.json()
@staticmethod
def _stream_to_file(response,
                    file,
                    progress: bool = False,
                    block_size: int = 1024) -> None:
    """
    This function copies the streamed body of a response into a file.

    Parameters
    ----------
    response : requests.Response
        A response that was requested with ``stream=True``.
    file : file object
        A file object opened for writing in binary mode.
    progress : bool
        Whether the progress of the download should be printed out or not.
    block_size : int
        The number of bytes requested per chunk.
    """
    # A missing content-length header results in a total of 0.
    file_size = int(response.headers.get('content-length', 0))
    written = 0

    for chunk in response.iter_content(block_size):
        file.write(chunk)
        written += len(chunk)
        # Without a known total there is nothing meaningful to display and
        # the division below would raise a ZeroDivisionError.
        if progress and file_size > 0:
            Unpywall._progress(written/file_size)


@staticmethod
def download_pdf_handle(doi: str):
    """
    This function returns a file-like object containing the requested PDF.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.

    Returns
    -------
    object
        The handle of the PDF file.
    """
    pdf_link = Unpywall.get_pdf_link(doi)
    r = requests.get(pdf_link)
    # A PDF is binary data. r.text decodes the body as text and encoding
    # it again does not give back the original bytes, so use r.content.
    return BytesIO(r.content)


@staticmethod
def view_pdf(doi: str,
             mode: str = 'viewer',
             progress: bool = False) -> None:
    """
    This function opens a local copy of a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    mode : str
        The mode for viewing a PDF. With 'viewer' the PDF is downloaded to
        a temporary file and opened with the default application of the
        operating system. Any other value opens the URL in a web browser.
    progress : bool
        Whether the progress of the API call should be printed out or not.

    Raises
    ------
    requests.exceptions.HTTPError
        If the PDF could not be downloaded in 'viewer' mode.
    """

    url = Unpywall.get_pdf_link(doi)

    # The with statement releases the connection on every exit path.
    with requests.get(url, stream=True) as r:

        if mode != 'viewer':
            webbrowser.open_new(url)
            return

        # Do not store an error page as if it were the PDF.
        r.raise_for_status()

        # delete=False keeps the file on disk after the handle is closed
        # so that the external viewer is able to open it. The handle
        # itself is written to directly instead of opening the path a
        # second time.
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix='.pdf') as tmp:
            Unpywall._stream_to_file(r, tmp, progress=progress)

    system = platform.system()

    # macOS
    if system == 'Darwin':
        subprocess.run(['open', tmp.name], check=True)
    # Windows
    elif system == 'Windows':
        os.startfile(tmp.name)
    # Linux
    else:
        subprocess.run(['xdg-open', tmp.name], check=True)


@staticmethod
def download_pdf_file(doi: str,
                      filename: str,
                      filepath: str = '.',
                      progress: bool = False) -> None:
    """
    This function downloads a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    filename : str
        The filename for the PDF.
    filepath : str
        The path to store the downloaded PDF. Missing directories are
        created.
    progress : bool
        Whether the progress of the API call should be printed out or not.

    Raises
    ------
    requests.exceptions.HTTPError
        If the PDF could not be downloaded.
    """

    url = Unpywall.get_pdf_link(doi)
    path = os.path.join(filepath, filename)

    with requests.get(url, stream=True) as r:
        # Fail before anything is written so that an error page is never
        # saved under the requested filename.
        r.raise_for_status()

        # exist_ok avoids the race between checking for the directory and
        # creating it.
        os.makedirs(filepath, exist_ok=True)

        with open(path, 'wb') as file:
            Unpywall._stream_to_file(r, file, progress=progress)
class main:
    """
    Command-line front end of unpywall.

    Creating an instance parses the first command-line token as a command
    name and runs the method of the same name (``view``, ``download`` or
    ``link``). Each command parses the remaining tokens on its own.

    Parameters
    ----------
    test_args : list, optional
        The argument vector to parse instead of ``sys.argv[1:]``.
    """

    # Only these names may be dispatched to. A plain hasattr() check would
    # accept every attribute of the instance (e.g. 'test_args') as command.
    _commands = ('view', 'download', 'link')

    def __init__(self, test_args=None) -> None:
        self.test_args = test_args
        usage = textwrap.dedent("""\
            unpywall <command> [<args>]

            Command-line tool for interfacing the Unpaywall API
            """)

        description = textwrap.dedent("""
            These are common unpywall commands:

            view        This command opens a local copy of a PDF from
                        a given DOI.
            download    This command downloads a copy of a PDF from a
                        given DOI.
            link        This command returns a link to an OA pdf
                        (if available).
            """)
        ap = UnpywallArgumentParser(prog='unpywall',
                                    usage=usage,
                                    description=description,
                                    formatter_class=RawTextHelpFormatter,
                                    add_help=False)

        ap.add_argument('command', help=SUPPRESS)
        ap.add_argument('-h',
                        '--help',
                        action='help',
                        default=SUPPRESS,
                        help=SUPPRESS)

        # Only the first token is parsed here; the rest belongs to the
        # selected command.
        args = ap.parse_args(self._argv()[0:1])

        if args.command not in self._commands:
            print('Unknown option: {}'.format(args.command))
            ap.print_help()
            sys.exit(1)
        getattr(self, args.command)()

    def __repr__(self) -> str:
        # __repr__ has to return a string, returning None makes repr()
        # raise a TypeError.
        return '<unpywall command-line interface>'

    def _argv(self) -> list:
        """
        Returns the argument vector without the program name.

        An explicitly passed ``test_args`` (even an empty one) takes
        precedence over ``sys.argv``.
        """
        if self.test_args is not None:
            return list(self.test_args)
        return sys.argv[1:]

    @staticmethod
    def _to_bool(value: str) -> bool:
        """
        Converts the textual value of a command-line flag to a bool.

        ``type=bool`` must not be used with argparse because ``bool`` of
        every non-empty string, including 'False', is True.

        Raises
        ------
        ValueError
            If the value is not a known spelling of true or false.
            argparse reports this as an invalid argument value.
        """
        text = value.strip().lower()
        if text in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if text in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise ValueError('expected a boolean value, got {!r}'.format(value))

    def _parser(self, description: str):
        """Creates the parser of a command with its ``doi`` argument."""
        ap = UnpywallArgumentParser(description=description,
                                    formatter_class=RawTextHelpFormatter,
                                    add_help=False)
        ap.add_argument('doi',
                        type=str,
                        metavar='doi',
                        help='\tThe DOI of the document.')
        return ap

    @staticmethod
    def _add_backend(ap) -> None:
        """Adds the ``--backend`` option shared by all commands."""
        # NOTE(review): the parsed backend is not passed on to Unpywall by
        # any command in this class - confirm whether that is intended.
        ap.add_argument('-b',
                        '--backend',
                        type=str,
                        default='remote',
                        dest='backend',
                        choices=['remote', 'cache', 'snapshot'],
                        metavar='\b',
                        help='\tThe backend you want to use.')

    def _add_progress(self, ap) -> None:
        """Adds the ``--progress`` option."""
        ap.add_argument('-u',
                        '--progress',
                        type=self._to_bool,
                        default=False,
                        dest='progress',
                        metavar='\b',
                        help='\tShow progress bar.')

    @staticmethod
    def _add_help(ap) -> None:
        """Adds the ``--help`` option."""
        ap.add_argument('-h',
                        '--help',
                        action='help',
                        default=SUPPRESS,
                        help=SUPPRESS)

    def view(self) -> None:
        """Opens a local copy of a PDF from a given DOI."""
        ap = self._parser('This command opens a local'
                          + ' copy of a PDF from a'
                          + ' given DOI.')
        ap.add_argument('-m',
                        '--mode',
                        type=str,
                        default='viewer',
                        dest='mode',
                        choices=['viewer', 'browser'],
                        metavar='\b',
                        help='\tThe mode for viewing a PDF.')
        self._add_backend(ap)
        self._add_progress(ap)
        self._add_help(ap)

        args = ap.parse_args(self._argv()[1:])

        Unpywall.view_pdf(args.doi, args.mode, progress=args.progress)

    def download(self) -> None:
        """Downloads a copy of a PDF from a given DOI."""
        ap = self._parser('This command downloads a'
                          + ' copy of a PDF from a'
                          + ' given DOI.')
        ap.add_argument('-f',
                        '--filename',
                        type=str,
                        dest='filename',
                        metavar='\b',
                        required=True,
                        help='\tThe filename for downloading a PDF.')
        ap.add_argument('-p',
                        '--path',
                        type=str,
                        default='.',
                        dest='filepath',
                        metavar='\b',
                        help='\tThe filepath for downloading a PDF.')
        self._add_backend(ap)
        self._add_progress(ap)
        self._add_help(ap)

        args = ap.parse_args(self._argv()[1:])

        try:
            Unpywall.download_pdf_file(args.doi,
                                       filename=args.filename,
                                       filepath=args.filepath,
                                       progress=args.progress)
        except Exception as err:
            # Outermost boundary of the command: report the reason and
            # signal the failure to the calling shell instead of exiting
            # with status 0.
            print('Could not download file: {}'.format(err),
                  file=sys.stderr)
            sys.exit(1)

        print('File was successfully downloaded.')

    def link(self) -> None:
        """Prints a link to an OA pdf (if available)."""
        ap = self._parser('This command returns a link'
                          + ' to an OA pdf'
                          + ' (if available).')
        self._add_backend(ap)
        self._add_help(ap)

        args = ap.parse_args(self._argv()[1:])

        print(Unpywall.get_pdf_link(args.doi))
def get(self, doi, errors='raise', force=False, ignore_cache=False):
    """
    Return the record for the given doi.

    Parameters
    ----------
    doi : str
        The DOI to be retrieved.
    errors : str
        Whether to ignore or raise errors.
    force : bool
        Whether to force the cache to retrieve a new entry.
    ignore_cache : bool
        Whether to use or ignore the cache. If True the record is
        downloaded and returned without reading or updating the cache.

    Returns
    -------
    record
        The response from Unpaywall or None if nothing could be
        downloaded.
    """
    if ignore_cache:
        return self.download(doi, errors)

    is_fresh = (doi in self.content
                and not force
                and not self.timed_out(doi))
    if is_fresh:
        return deepcopy(self.content[doi])

    downloaded = self.download(doi, errors)
    if not downloaded:
        return None

    self.access_times[doi] = time.time()
    self.content[doi] = downloaded
    self.save()

    # Hand out a copy here as well. A cache hit returns a copy, so
    # returning the stored object on a miss would let the caller modify
    # the cached record.
    return deepcopy(downloaded)