Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

command-line-interface #27

Merged
merged 4 commits into from
Apr 20, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
133 changes: 103 additions & 30 deletions unpywall/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import urllib.request
import requests
import pandas as pd
import json
import time
import sys
import subprocess
import tempfile
import webbrowser
import os


class Unpywall:
Expand Down Expand Up @@ -82,6 +86,7 @@ def _progress(progress: float) -> None:
def get_df(dois: list,
progress: bool = False,
errors: str = 'raise',
force: bool = False,
ignore_cache: bool = True) -> pd.DataFrame:
"""
Parses information from the Unpaywall API service and returns it as
Expand All @@ -96,6 +101,10 @@ def get_df(dois: list,
errors : str
Either 'raise' or 'ignore'. If the parameter errors is set to
'ignore' then errors will not raise an exception.
force : bool
Whether to force the cache to retrieve a new entry.
ignore_cache : bool
Whether to use or ignore the cache.

Returns
-------
Expand All @@ -107,8 +116,6 @@ def get_df(dois: list,
------
ValueError
If the parameter errors contains a faulty value.
AttributeError
If the Unpaywall API did not respond with json.
"""

dois = Unpywall._validate_dois(dois)
Expand All @@ -124,31 +131,28 @@ def get_df(dois: list,
if progress:
Unpywall._progress(n/len(dois))

try:
r = Unpywall.get_json(doi,
errors=errors,
ignore_cache=ignore_cache)

# check if json is not empty due to a faulty DOI
if not bool(r):
continue
r = Unpywall.get_json(doi,
errors=errors,
force=force,
ignore_cache=ignore_cache)

df2 = pd.json_normalize(data=r, max_level=1, errors=errors)
# check if json is not empty or None due to a faulty DOI
if not bool(r):
continue

df = df.append(df2)
df2 = pd.json_normalize(data=r, max_level=1, errors=errors)

except (AttributeError, json.decoder.JSONDecodeError):
df = df.append(df2)

if errors == 'raise':
raise AttributeError('Unpaywall API did not return json')
else:
continue
if df.empty:
return None

return df

@staticmethod
def get_json(doi: str,
errors: str = 'raise',
force: bool = False,
ignore_cache: bool = False):
"""
This function returns all information in Unpaywall about the given DOI.
Expand All @@ -157,23 +161,38 @@ def get_json(doi: str,
----------
doi : str
The DOI of the requested paper.
errors : str
Either 'raise' or 'ignore'. If the parameter errors is set to
'ignore' then errors will not raise an exception.
force : bool
Whether to force the cache to retrieve a new entry.
ignore_cache : bool
Whether to use or ignore the cache.

Returns
-------
JSON object
A JSON data structure containing all information
returned by Unpaywall about the given DOI.

Raises
------
AttributeError
If the Unpaywall API did not respond with json.
"""
from .cache import cache

r = cache.get(doi, errors, ignore_cache)
if r:
r = cache.get(doi,
errors=errors,
force=force,
ignore_cache=ignore_cache)
try:
return r.json()
else:
except AttributeError:
return None

@staticmethod
def get_pdf_link(doi: str, errors: str = 'raise'):
def get_pdf_link(doi: str):
"""
This function returns a link to an OA PDF (if available).

Expand All @@ -187,14 +206,14 @@ def get_pdf_link(doi: str, errors: str = 'raise'):
str
The URL of an OA PDF (if available).
"""
json_data = Unpywall.get_json(doi, errors=errors)
json_data = Unpywall.get_json(doi)
try:
return json_data['best_oa_location']['url_for_pdf']
except (KeyError, TypeError):
return None

@staticmethod
def get_doc_link(doi: str, errors: str = 'raise'):
def get_doc_link(doi: str):
"""
This function returns a link to the best OA location
(not necessarily a PDF).
Expand All @@ -209,14 +228,14 @@ def get_doc_link(doi: str, errors: str = 'raise'):
str
The URL of the best OA location (not necessarily a PDF).
"""
json_data = Unpywall.get_json(doi, errors)
json_data = Unpywall.get_json(doi)
try:
return json_data['best_oa_location']['url']
except (KeyError, TypeError):
return None

@staticmethod
def get_all_links(doi: str, errors: str = 'raise') -> list:
def get_all_links(doi: str) -> list:
"""
This function returns a list of URLs for all open-access copies
listed in Unpaywall.
Expand All @@ -232,14 +251,14 @@ def get_all_links(doi: str, errors: str = 'raise') -> list:
A list of URLs leading to open-access copies.
"""
data = []
for value in [Unpywall.get_doc_link(doi, errors),
Unpywall.get_pdf_link(doi, errors)]:
for value in [Unpywall.get_doc_link(doi),
Unpywall.get_pdf_link(doi)]:
if value and value not in data:
data.append(value)
return data

@staticmethod
def download_pdf_handle(doi: str, errors: str = 'raise'):
def download_pdf_handle(doi: str):
"""
This function returns a file-like object containing the requested PDF.

Expand All @@ -253,5 +272,59 @@ def download_pdf_handle(doi: str, errors: str = 'raise'):
object
The handle of the PDF file.
"""
pdf_link = Unpywall.get_pdf_link(doi, errors)
pdf_link = Unpywall.get_pdf_link(doi)
return urllib.request.urlopen(pdf_link)

@staticmethod
def view_pdf(doi: str, mode: str = 'viewer') -> None:
    """
    This function opens a local copy of a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    mode : str
        The mode for viewing a PDF. 'viewer' downloads the PDF to a
        temporary file and opens it with the system viewer; any other
        value opens the PDF URL in the web browser instead.
    """

    url = Unpywall.get_pdf_link(doi)
    # `stream` expects a boolean; the original passed the URL string,
    # which only worked because a non-empty string is truthy.
    r = requests.get(url, stream=True)

    if mode == 'viewer':

        # delete=False so the file survives for the external viewer;
        # the temp file is intentionally left for the OS to clean up.
        tmp = tempfile.NamedTemporaryFile(delete=False)

        with open(tmp.name, 'wb') as file:
            file.write(r.content)

        # NOTE(review): 'open' is the macOS opener; this fails on Linux
        # (xdg-open) and Windows (os.startfile) — confirm target platforms.
        subprocess.run(['open', tmp.name], check=True)

    else:
        webbrowser.open_new(url)

@staticmethod
def download_pdf_file(doi: str, filename: str, filepath: str) -> None:
    """
    This function downloads a PDF from a given DOI.

    Parameters
    ----------
    doi : str
        The DOI of the requested paper.
    filename : str
        The filename for the PDF.
    filepath : str
        The path to store the downloaded PDF. Created if it does not
        already exist.
    """

    url = Unpywall.get_pdf_link(doi)
    # `stream` expects a boolean; the original passed the URL string,
    # which only worked because a non-empty string is truthy.
    r = requests.get(url, stream=True)

    # exist_ok avoids the check-then-create race of the original
    # os.path.exists(...) + os.makedirs(...) sequence.
    os.makedirs(filepath, exist_ok=True)

    path = os.path.join(filepath, filename)

    with open(path, 'wb') as file:
        file.write(r.content)
38 changes: 35 additions & 3 deletions unpywall/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,44 @@ def main():
ap.add_argument('method',
type=str,
metavar='method',
choices=['pdf_link', 'download_pdf', 'view_pdf'],
help='\tThe method you want to use.')
ap.add_argument('-b',
'--backend',
type=str,
default='remote',
dest='backend',
choices=['remote', 'cache', 'snapshot'],
metavar='\b',
help='\tThe backend you want to use.')
ap.add_argument('-e',
'--errors',
type=str,
default='raise',
dest='errors',
choices=['raise', 'ignore'],
metavar='\b',
help='\tThe error behaviour you want to use.')
ap.add_argument('-f',
'--filename',
type=str,
dest='filename',
metavar='\b',
help='\tThe filename for downloading a PDF.')
ap.add_argument('-m',
'--mode',
type=str,
default='viewer',
dest='mode',
choices=['viewer', 'browser'],
metavar='\b',
help='\tThe mode for viewing a PDF.')
ap.add_argument('-p',
'--path',
type=str,
dest='filepath',
metavar='\b',
help='\tThe filepath for downloading a PDF.')
ap.add_argument('-h',
'--help',
action='help',
Expand All @@ -50,10 +73,19 @@ def main():
args = ap.parse_args()

doi = args.doi
errors = args.errors

if args.method == 'get_pdf':
print(Unpywall.get_pdf_link(doi, errors))
if args.method == 'pdf_link':
print(Unpywall.get_pdf_link(doi))

if args.method == 'download_pdf':
try:
Unpywall.download_pdf_file(doi, args.filename, args.filepath)
print('File was successfully downloaded.')
except Exception:
print('Could not download file.')

if args.method == 'view_pdf':
Unpywall.view_pdf(doi, mode=args.mode)


if __name__ == '__main__':
Expand Down
31 changes: 20 additions & 11 deletions unpywall/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, timeout='never', name=None):
try:
self.load(self.name)
except FileNotFoundError:
print('No cache found')
warnings.warn('No cache found. A new cache was initialized.')
self.reset_cache()
self.timeout = timeout

Expand Down Expand Up @@ -87,7 +87,7 @@ def timed_out(self, doi):
is_timed_out = time.time() > self.access_times[doi] + self.timeout
return is_timed_out

def get(self, doi, errors='raise', ignore_cache=False):
def get(self, doi, errors='raise', force=False, ignore_cache=False):
"""
Return the record for the given doi.

Expand All @@ -97,24 +97,30 @@ def get(self, doi, errors='raise', ignore_cache=False):
The DOI to be retrieved.
errors : str
Whether to ignore or raise errors.
ignore_cache : bool
force : bool
Whether to force the cache to retrieve a new entry.
ignore_cache : bool
Whether to use or ignore the cache.

Returns
-------
record : requests.Response
The response from Unpaywall.
"""
record = None
if (doi not in self.content) or self.timed_out(doi) or ignore_cache:
downloaded = self.download(doi, errors)
if downloaded:
self.access_times[doi] = time.time()
self.content[doi] = downloaded
self.save()
record = downloaded

if not ignore_cache:
if (doi not in self.content) or self.timed_out(doi) or force:
downloaded = self.download(doi, errors)
if downloaded:
self.access_times[doi] = time.time()
self.content[doi] = downloaded
self.save()
record = downloaded
else:
record = deepcopy(self.content[doi])
else:
record = deepcopy(self.content[doi])
record = self.download(doi, errors)
return record

def save(self, name=None):
Expand Down Expand Up @@ -174,6 +180,7 @@ def download(self, doi, errors):
r.raise_for_status()
return r

# if DOI is invalid
except requests.exceptions.HTTPError as HTTPError:
if errors == 'raise':
raise HTTPError
Expand All @@ -182,10 +189,12 @@ def download(self, doi, errors):
if errors == 'raise':
raise RequestException

# if bad internet connection
except requests.exceptions.ConnectionError as ConnectionError:
if errors == 'raise':
raise ConnectionError

# request timed out (e.g. server unresponsive)
except requests.exceptions.Timeout as Timeout:
if errors == 'raise':
raise Timeout
Expand Down