Skip to content

Commit

Permalink
dump DOI output to file
Browse files Browse the repository at this point in the history
  • Loading branch information
tclose committed Aug 12, 2020
1 parent 1961ce0 commit 34baaf2
Showing 1 changed file with 40 additions and 8 deletions.
48 changes: 40 additions & 8 deletions scripts/find_authors.py
@@ -1,4 +1,9 @@
import sys
"""
Script to download full text articles linked to facility researchers and
search for references to NIF-related instruments
"""
import os.path as op
from argparse import ArgumentParser
from urllib.parse import unquote as unquote_url
import requests
from requests.exceptions import ConnectionError
Expand Down Expand Up @@ -53,6 +58,14 @@ def text_from_pii(pii):
'BIOC',
'PSYC']

# Imported here because the visible import header only has
# ``import os.path as op``, which binds ``op`` but not the name ``os``.
import os

# Command-line interface. The module docstring is passed explicitly as
# ``description``: the first positional parameter of ArgumentParser is
# ``prog``, so passing it positionally would replace the program name in the
# usage line instead of describing the script.
parser = ArgumentParser(description=__doc__)
parser.add_argument('--full_text_dir', default=None,
                    help="Directory to dump full text outputs")
args = parser.parse_args()

# Create the dump directory up front so the per-publication writes further
# down can assume it exists. Nothing is created when the option is omitted.
if args.full_text_dir:
    os.makedirs(args.full_text_dir, exist_ok=True)

# Publication records iterated over at the end of the script (presumably
# filled by the author loop that follows -- its body is not visible here).
publications = []

for first, last, initials in AUTHORS:
Expand Down Expand Up @@ -87,14 +100,33 @@ def text_from_pii(pii):
len([p for p in publications if p.doi])))

# For every publication, try to download its text via the PII and/or the DOI,
# optionally dump what was retrieved into ``args.full_text_dir`` and print a
# one-line summary including the download status.
for pub in publications:
    full_text = None  # text written to "<title>.html"
    pii_text = None   # PII text kept aside when a DOI download is also tried
    if pub.pii:
        full_text = text_from_pii(pub.pii)
        if full_text is None:
            status = 'Could not download from PII'
        else:
            status = 'Text downloaded from PII'
    if pub.doi:
        # The DOI download replaces ``full_text``; keep whatever the PII
        # route returned (possibly None) so it can still be dumped.
        pii_text = full_text
        full_text = text_from_doi(pub.doi)
        if full_text is None:
            status = "Could not access DOI"
        else:
            status = "Downloaded from DOI"
    elif not pub.pii:
        # Only report "not found" when neither identifier exists; testing
        # ``pii_text`` here would clobber the status of a PII-only record,
        # because ``pii_text`` is only assigned in the DOI branch.
        status = "Could not find DOI or PII for title!!!"

    if args.full_text_dir:
        # Titles can contain path separators, which would otherwise be
        # interpreted as sub-directories of the dump directory.
        base_path = op.join(args.full_text_dir,
                            pub.title[:100].replace(op.sep, '_'))
        # Files must be opened for writing ('w'); the default mode is
        # read-only and makes ``f.write`` fail.
        if full_text:
            with open(base_path + '.html', 'w', encoding='utf-8') as f:
                f.write(full_text)
        # Dump the PII text even when the DOI download failed, so that
        # successfully retrieved text is never discarded.
        if pii_text:
            with open(base_path + '.txt', 'w', encoding='utf-8') as f:
                f.write(pii_text)

    print('Title: {} | PII: {} | DOI: {} | Status: {}\n'
          .format(pub.title, pub.pii, pub.doi, status))

0 comments on commit 34baaf2

Please sign in to comment.