/
add_content.py
95 lines (82 loc) · 2.9 KB
/
add_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Script to download full-text articles for publications so they can be searched
for key terms.
"""
import os.path
import json
import io
from urllib.parse import unquote as unquote_url
import requests
from requests.exceptions import ConnectionError
from PyPDF2 import PdfFileReader
from bs4 import BeautifulSoup
import pybliometrics.scopus as sc
from app import db
from app.models import Publication
from app.constants import (
CANT_ACCESS_CONTENT, PLAIN_TEXT_ACCESS_CONTENT, HTML_ACCESS_CONTENT,
UNKNOWN_ACCESS_CONTENT) # PDF_ACCESS_CONTENT,
# Base URLs of the remote services queried for article content.
DOI_RESOLVER = 'http://doi.org/'
SCIENCE_DIRECT = 'http://api.elsevier.com/content/article/pii/'
# NOTE(review): despite its name this constant points at a Wiley
# (api.wiley.com) text-and-data-mining endpoint.
CROSSREF = 'https://api.wiley.com/onlinelibrary/tdm/v1/articles/'

# Optional JSON config file holding the click-through API token.  HOME is
# still preferred when set; expanduser() is only a fallback so that importing
# this module no longer raises KeyError when HOME is missing.
crossref_config = os.path.join(
    os.environ.get('HOME') or os.path.expanduser('~'),
    '.crossref', 'config.json')

# Default to None so the name is always defined, even without a config file.
crossref_token = None
if os.path.exists(crossref_config):
    with open(crossref_config) as f:
        crossref_token = json.load(f)['APIToken']
def content_from_doi(doi):
    """
    Fetch and parse the web page that a DOI resolves to.

    Parameters
    ----------
    doi : str
        The DOI, appended verbatim to ``DOI_RESOLVER``.

    Returns
    -------
    BeautifulSoup or None
        The parsed HTML of the resolved page (or of the page named by its
        'redirectURL' element, when present).  None is returned when a
        connection error occurs or when the page title starts with
        'Attention Required!'.
    """
    try:
        response = requests.get(DOI_RESOLVER + doi)
    except ConnectionError:
        return None
    html = BeautifulSoup(response.text, features='lxml')
    # Some pages carry the real target, URL-quoted, in an element whose id
    # is 'redirectURL'; follow it manually.
    redirect_tag = html.find(id='redirectURL')
    if redirect_tag:
        redirect_url = unquote_url(redirect_tag.attrs['value'])
        try:
            # Guard the follow-up request the same way as the first one so a
            # dropped connection does not abort the caller.
            response = requests.get(redirect_url)
        except ConnectionError:
            return None
        html = BeautifulSoup(response.text, features='lxml')
    # A page may have no <title> at all; only reject pages whose title marks
    # them as an 'Attention Required!' page (presumably a bot-protection
    # interstitial -- TODO confirm).
    title = html.find('title')
    if title is not None and title.text.startswith('Attention Required!'):
        # TODO: consider falling back to content_from_crossref(doi) here.
        return None
    return html
def content_from_crossref(doi):
    """
    Download an article PDF from the ``CROSSREF`` endpoint and extract its
    text.

    Parameters
    ----------
    doi : str
        The DOI, appended verbatim to ``CROSSREF``.

    Returns
    -------
    str or None
        The text of every page concatenated in page order, or None when the
        server does not return a successful response.
    """
    response = requests.get(
        CROSSREF + doi,
        headers={"CR-Clickthrough-Client-Token": crossref_token,
                 "Accept": 'application/pdf'})
    if not response.ok:
        # An error body is not a PDF; signal failure with None, as
        # content_from_pii does, instead of handing it to the PDF parser.
        return None
    pdf = PdfFileReader(io.BytesIO(response.content))
    return ''.join(page.extractText() for page in pdf.pages)
def content_from_pii(pii):
    """
    Retrieve the full text of an article from the ``SCIENCE_DIRECT`` API.

    Parameters
    ----------
    pii : str
        The article's PII, appended verbatim to ``SCIENCE_DIRECT``.

    Returns
    -------
    str or None
        The 'originalText' field of the 'full-text-retrieval-response'
        object in the JSON reply.  None is returned when the connection
        fails, the response is not successful, the body is not valid JSON,
        or the expected keys are missing.
    """
    # Looked up outside the try block so a missing key is not mistaken for a
    # network problem.
    api_key = sc.config['Authentication']['APIKey']
    try:
        response = requests.get(
            SCIENCE_DIRECT + pii,
            headers={"X-ELS-APIKey": api_key,
                     "Accept": 'application/json'})
    except ConnectionError:
        # Same policy as content_from_doi: a network failure means the
        # content is unavailable, not that the caller should crash.
        return None
    if not response.ok:
        return None
    try:
        return response.json()['full-text-retrieval-response']['originalText']
    except (KeyError, ValueError):
        # KeyError: reply lacks the expected fields.
        # ValueError: body was not valid JSON.
        return None
# Try to fetch content for every publication that does not have any yet and
# record how (or whether) it could be accessed.
for pub in Publication.query.all():
    if pub.has_content:
        continue
    # Reset for each publication.  Without this, a publication with neither
    # a PII nor a DOI would hit an unbound name on the first iteration, or be
    # given the content fetched for the previous publication.
    content = None
    if pub.pii:
        content = content_from_pii(pub.pii)
        if content is None:
            pub.access_status = CANT_ACCESS_CONTENT
        else:
            pub.access_status = PLAIN_TEXT_ACCESS_CONTENT
    elif pub.doi:
        # NOTE(review): content_from_doi returns a parsed BeautifulSoup
        # object rather than a string -- confirm Publication.content can
        # store it.
        content = content_from_doi(pub.doi)
        if content is None:
            pub.access_status = CANT_ACCESS_CONTENT
        else:
            pub.access_status = HTML_ACCESS_CONTENT
    else:
        pub.access_status = UNKNOWN_ACCESS_CONTENT
    if content:
        pub.content = content
db.session.commit()