Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

command-line-interface #27

Merged
merged 4 commits into from
Apr 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from unpywall.__main__ import main
from requests.exceptions import HTTPError
import pytest


class TestUnpywallCli:
    """Exit-code and error checks for the command-line entry point ``main``."""

    def test_main(self):
        """``-h`` must exit with code 0, an unknown argument with code 1."""
        with pytest.raises(SystemExit) as help_exit:
            main(test_args=['-h'])
        assert help_exit.value.code == 0

        with pytest.raises(SystemExit) as bad_arg_exit:
            main(test_args=['this is a bad argument'])
        assert bad_arg_exit.value.code == 1

    def test_view(self):
        """``view`` without a DOI exits with code 2; a bad DOI raises."""
        with pytest.raises(SystemExit) as missing_doi_exit:
            main(test_args=['view'])
        assert missing_doi_exit.value.code == 2

        bad_doi = 'this is a bad doi'
        with pytest.raises(HTTPError):
            main(test_args=['view', bad_doi])
180 changes: 139 additions & 41 deletions unpywall/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
import urllib.request
import requests
import pandas as pd
import json
import time
import sys
import subprocess
import tempfile
import webbrowser
import os
import platform
from io import BytesIO


class Unpywall:
"""
Base class that contains useful functions for retrieving information
from the Unpaywall REST API (https://api.unpaywall.org). This client uses
version 2 of the API.

Methods
-------
get_df(dois, progress, errors)
Retrieves information from the Unpaywall API service and returns a
pandas DataFrame.
"""

api_limit: int = 100000
Expand Down Expand Up @@ -73,7 +71,6 @@ def _progress(progress: float) -> None:
int(progress * 100))

print(text, end='\r', flush=False, file=sys.stdout)
time.sleep(0.1)

if progress == 1:
print('\n', file=sys.stdout)
Expand All @@ -82,6 +79,7 @@ def _progress(progress: float) -> None:
def get_df(dois: list,
progress: bool = False,
errors: str = 'raise',
force: bool = False,
ignore_cache: bool = True) -> pd.DataFrame:
"""
Parses information from the Unpaywall API service and returns it as
Expand All @@ -96,6 +94,10 @@ def get_df(dois: list,
errors : str
Either 'raise' or 'ignore'. If the parameter errors is set to
'ignore' then errors will not raise an exception.
force : bool
Whether to force the cache to retrieve a new entry.
ignore_cache : bool
Whether to use or ignore the cache.

Returns
-------
Expand All @@ -107,8 +109,6 @@ def get_df(dois: list,
------
ValueError
If the parameter errors contains a faulty value.
AttributeError
If the Unpaywall API did not respond with json.
"""

dois = Unpywall._validate_dois(dois)
Expand All @@ -124,31 +124,28 @@ def get_df(dois: list,
if progress:
Unpywall._progress(n/len(dois))

try:
r = Unpywall.get_json(doi,
errors=errors,
ignore_cache=ignore_cache)
r = Unpywall.get_json(doi,
errors=errors,
force=force,
ignore_cache=ignore_cache)

# check if json is not empty due to an faulty DOI
if not bool(r):
continue
# check if json is not empty or None due to a faulty DOI
if not bool(r):
continue

df2 = pd.json_normalize(data=r, max_level=1, errors=errors)
df2 = pd.json_normalize(data=r, max_level=1, errors=errors)

df = df.append(df2)
df = df.append(df2)

except (AttributeError, json.decoder.JSONDecodeError):

if errors == 'raise':
raise AttributeError('Unpaywall API did not return json')
else:
continue
if df.empty:
return None

return df

@staticmethod
def get_json(doi: str,
errors: str = 'raise',
force: bool = False,
ignore_cache: bool = False):
"""
This function returns all information in Unpaywall about the given DOI.
Expand All @@ -157,25 +154,40 @@ def get_json(doi: str,
----------
doi : str
The DOI of the requested paper.
errors : str
Either 'raise' or 'ignore'. If the parameter errors is set to
'ignore' then errors will not raise an exception.
force : bool
Whether to force the cache to retrieve a new entry.
ignore_cache : bool
Whether to use or ignore the cache.

Returns
-------
JSON object
A JSON data structure containing all information
returned by Unpaywall about the given DOI.

Raises
------
AttributeError
If the Unpaywall API did not respond with json.
"""
from .cache import cache

r = cache.get(doi, errors, ignore_cache)
if r:
r = cache.get(doi,
errors=errors,
force=force,
ignore_cache=ignore_cache)
try:
return r.json()
else:
except AttributeError:
return None

@staticmethod
def get_pdf_link(doi: str, errors: str = 'raise'):
def get_pdf_link(doi: str):
"""
This function returns a link to the an OA pdf (if available).
This function returns a link to an OA pdf (if available).

Parameters
----------
Expand All @@ -187,14 +199,14 @@ def get_pdf_link(doi: str, errors: str = 'raise'):
str
The URL of an OA PDF (if available).
"""
json_data = Unpywall.get_json(doi, errors=errors)
json_data = Unpywall.get_json(doi)
try:
return json_data['best_oa_location']['url_for_pdf']
except (KeyError, TypeError):
return None

@staticmethod
def get_doc_link(doi: str, errors: str = 'raise'):
def get_doc_link(doi: str):
"""
This function returns a link to the best OA location
(not necessarily a PDF).
Expand All @@ -209,14 +221,14 @@ def get_doc_link(doi: str, errors: str = 'raise'):
str
The URL of the best OA location (not necessarily a PDF).
"""
json_data = Unpywall.get_json(doi, errors)
json_data = Unpywall.get_json(doi)
try:
return json_data['best_oa_location']['url']
except (KeyError, TypeError):
return None

@staticmethod
def get_all_links(doi: str, errors: str = 'raise') -> list:
def get_all_links(doi: str) -> list:
"""
This function returns a list of URLs for all open-access copies
listed in Unpaywall.
Expand All @@ -232,14 +244,14 @@ def get_all_links(doi: str, errors: str = 'raise') -> list:
A list of URLs leading to open-access copies.
"""
data = []
for value in [Unpywall.get_doc_link(doi, errors),
Unpywall.get_pdf_link(doi, errors)]:
for value in [Unpywall.get_doc_link(doi),
Unpywall.get_pdf_link(doi)]:
if value and value not in data:
data.append(value)
return data

@staticmethod
def download_pdf_handle(doi: str, errors: str = 'raise'):
def download_pdf_handle(doi: str):
"""
This function returns a file-like object containing the requested PDF.

Expand All @@ -253,5 +265,91 @@ def download_pdf_handle(doi: str, errors: str = 'raise'):
object
The handle of the PDF file.
"""
pdf_link = Unpywall.get_pdf_link(doi, errors)
return urllib.request.urlopen(pdf_link)
pdf_link = Unpywall.get_pdf_link(doi)
r = requests.get(pdf_link)
return BytesIO(bytearray(r.text, encoding='utf-8'))

@staticmethod
def view_pdf(doi: str,
             mode: str = 'viewer',
             progress: bool = False) -> None:
    """
    This function opens a local copy of a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    mode : str
        The mode for viewing a PDF. With 'viewer' the PDF is downloaded
        into a temporary file (which is not deleted afterwards) and opened
        with the platform's default application. Any other value opens
        the PDF URL in a web browser instead.
    progress : bool
        Whether the progress of the download should be printed out or not.
        Progress is only reported if the server sends a content-length.
    """

    # NOTE(review): get_pdf_link returns None when no OA PDF is listed;
    # the request below is then issued with None as URL -- confirm whether
    # a dedicated error should be raised here instead.
    url = Unpywall.get_pdf_link(doi)

    # stream=True defers the body so it can be written chunk by chunk;
    # the context manager releases the connection in every code path.
    with requests.get(url, stream=True) as r:

        if mode != 'viewer':
            webbrowser.open_new(url)
            return

        file_size = int(r.headers.get('content-length', 0))
        block_size = 1024

        # Write through the temporary file's own handle so that it is
        # closed (and flushed) before the external viewer opens it.
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix='.pdf') as tmp:
            downloaded = 0
            for chunk in r.iter_content(block_size):
                tmp.write(chunk)
                downloaded += len(chunk)
                # Without a content-length the ratio is undefined.
                if progress and file_size > 0:
                    Unpywall._progress(downloaded/file_size)

        path = tmp.name

    system = platform.system()

    # macOS
    if system == 'Darwin':
        subprocess.run(['open', path], check=True)
    # Windows
    elif system == 'Windows':
        os.startfile(path)
    # Linux
    else:
        subprocess.run(['xdg-open', path], check=True)

@staticmethod
def download_pdf_file(doi: str,
                      filename: str,
                      filepath: str = '.',
                      progress: bool = False) -> None:
    """
    This function downloads a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    filename : str
        The filename for the PDF.
    filepath : str
        The path to store the downloaded PDF. Missing directories are
        created. An empty string means the current working directory.
    progress : bool
        Whether the progress of the download should be printed out or not.
        Progress is only reported if the server sends a content-length.
    """

    # NOTE(review): get_pdf_link returns None when no OA PDF is listed;
    # the request below is then issued with None as URL -- confirm whether
    # a dedicated error should be raised here instead.
    url = Unpywall.get_pdf_link(doi)
    path = os.path.join(filepath, filename)

    # stream=True defers the body so it can be written chunk by chunk;
    # the context manager releases the connection in every code path.
    with requests.get(url, stream=True) as r:
        file_size = int(r.headers.get('content-length', 0))
        block_size = 1024

        # exist_ok avoids the check-then-create race; an empty filepath
        # refers to the current directory and needs no creation.
        if filepath:
            os.makedirs(filepath, exist_ok=True)

        with open(path, 'wb') as file:
            downloaded = 0
            for chunk in r.iter_content(block_size):
                file.write(chunk)
                downloaded += len(chunk)
                # Without a content-length the ratio is undefined.
                if progress and file_size > 0:
                    Unpywall._progress(downloaded/file_size)
Loading