In [None]:
import os
import requests
import json
from bs4 import BeautifulSoup as Soup
import pandas as pd

In [None]:
%run storage.ipynb

In [None]:
def scopus_search(query):
    """Run a Scopus search and collect the entries from every result page.

    The query is wrapped in ``TITLE-ABS-KEY(...)`` and pages of up to 200
    results are requested using the API's cursor, starting from ``"*"``.
    After each page a ``collected/total`` progress line is printed.

    Args:
        query: Search expression placed inside ``TITLE-ABS-KEY(...)``. It is
            inserted into the URL verbatim, so reserved characters such as
            ``&`` or ``#`` must already be percent-encoded by the caller.

    Returns:
        A list of the raw ``entry`` dicts from all pages, in response order.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    cursor = "*"
    entries = []
    while cursor:
        # The cursor is kept in its raw form and encoded by requests via
        # ``params``. Storing a hand-encoded cursor (as was done before) made
        # the "cursor did not advance" comparison below compare an encoded
        # value with a raw one, so it could never match for cursors that
        # contain '+' or '/'.
        response = requests.get(
            "https://api.elsevier.com/content/search/scopus?query=TITLE-ABS-KEY({})".format(query),
            params={"apiKey": ELSEVIER_APIKey, "cursor": cursor, "count": 200},
        )
        # Fail with the HTTP error itself rather than a KeyError on the
        # missing "search-results" key of an error payload.
        response.raise_for_status()
        results = json.loads(response.content)["search-results"]

        next_cursor = results["cursor"]["@next"]
        # An unchanged cursor means there is no further page to request.
        cursor = next_cursor if next_cursor != cursor else None

        if "entry" not in results:
            break
        entries.extend(results["entry"])
        print('{}/{}'.format(len(entries), results['opensearch:totalResults']))
    return entries


def scopus_get_article(aid, url):
    """Return the article document for ``aid``, using storage as a cache.

    The document is looked up under ``os.path.join('xml', aid + '.xml')`` with
    ``storage_download``. When that returns None, ``url`` is requested with the
    API key appended, the response text is saved with ``storage_upload`` under
    the same name, and that text is returned.

    Args:
        aid: Article identifier; used to build the cache file name.
        url: Retrieval URL for the article (without the ``apiKey`` parameter).

    Returns:
        The stored or freshly downloaded document.
    """
    cache_name = os.path.join('xml', aid + '.xml')
    cached = storage_download(cache_name)
    if cached is not None:
        return cached

    # Cache miss: download once and store the result for later calls.
    print('Fetching content from scopus for: {}'.format(aid))
    downloaded = requests.get("{}?apiKey={}".format(url, ELSEVIER_APIKey)).text
    storage_upload(downloaded, cache_name)
    return downloaded


def scopus_get_json(aid, url):
    """Return the parsed entry dict for ``aid``, using storage as a cache.

    The JSON text is looked up under ``os.path.join('json', aid + '.json')``
    with ``storage_download``. When that returns None, the article is fetched
    with ``scopus_get_article``, parsed with ``scopus_parse_entry``, tagged
    with ``BibMine-url`` / ``BibMine-aid``, serialized and uploaded.

    Args:
        aid: Article identifier; used to build the cache file name.
        url: Retrieval URL for the article.

    Returns:
        The entry as decoded from its JSON text. A freshly parsed entry also
        goes through the JSON round trip, so both paths yield the same shape.
    """
    cache_name = os.path.join('json', aid + '.json')
    serialized = storage_download(cache_name)
    if serialized is None:
        parsed = scopus_parse_entry(scopus_get_article(aid, url))
        parsed['BibMine-url'] = url
        parsed['BibMine-aid'] = aid
        serialized = json.dumps(parsed, sort_keys=True, indent=2, separators=(',', ': '))
        storage_upload(serialized, cache_name)
    return json.loads(serialized)


def get_text(s, t):
    """Return the text of the first ``t`` found under ``s``, or '' if none.

    Args:
        s: Object exposing ``find`` (in this file, a parsed document or tag).
        t: Tag name passed to ``s.find``.

    Returns:
        ``get_text()`` of the match when ``find`` yields a truthy result,
        otherwise the empty string.
    """
    match = s.find(t)
    return match.get_text() if match else ''

def get_full_text(doi):
    """Fetch the text of the ``ce:sections`` element of an article by DOI.

    Args:
        doi: DOI of the article. An empty value returns '' without a request.

    Returns:
        The text of the first ``ce:sections`` element in the response, or ''
        when the DOI is empty, the API answers 404, or no such element exists.
    """
    if not doi:
        # Records without a DOI have nothing to look up; skip the request.
        return ''
    # The query string is built by requests from ``params``. The previous URL
    # had no '?', so "&apiKey=...&httpAccept=..." was sent as part of the path
    # and the API never received the key or the accept type.
    response = requests.get(
        "https://api.elsevier.com/content/article/doi/{}".format(doi),
        params={"apiKey": ELSEVIER_APIKey, "httpAccept": "text/xml"},
    )
    if response.status_code == 404:
        return ''
    soup = Soup(response.content, features="lxml")
    return get_text(soup, 'ce:sections')


def scopus_parse_entry(article):
    """Extract bibliographic fields from an article document into a dict.

    The document is parsed with the ``lxml`` parser, and single-valued fields
    are read with ``get_text`` (empty string when the tag is absent). The
    ``Content`` field is filled by calling ``get_full_text`` with the DOI, so
    parsing may trigger a network request.

    Args:
        article: Markup of the article document.

    Returns:
        A dict with string fields (``Title``, ``Abstract``, ``DOI``, ...),
        list-of-string fields (``Keywords``, ``IndexTerms``, ``References``,
        ``SubjectAreas``) and list-of-dict fields (``Authors``,
        ``Affiliations``).
    """
    soup = Soup(article, features="lxml")

    def one_line(text):
        # Newlines and tabs are replaced by spaces to keep values on one line.
        return text.replace('\n', ' ').replace('\t', ' ')

    record = {'Title': one_line(get_text(soup, 'dc:title'))}
    for key, tag in (
        ('PublicationName', 'prism:publicationname'),
        ('Type', 'prism:aggregationtype'),
        ('Subtype', 'subtypedescription'),
    ):
        record[key] = get_text(soup, tag)

    # Only the first <ce:para> counts, and only when it sits directly inside
    # an <abstract> element.
    first_para = soup.find('ce:para')
    if first_para and first_para.parent.name == 'abstract':
        record['Abstract'] = one_line(first_para.get_text())
    else:
        record['Abstract'] = ''

    for key, tag in (
        ('Volume', 'prism:volume'),
        ('Issue', 'prism:issueidentifier'),
        ('Page Range', 'prism:pagerange'),
        ('Date', 'prism:coverdate'),
        ('EID', 'eid'),
        ('DOI', 'prism:doi'),
    ):
        record[key] = get_text(soup, tag)
    record['Content'] = get_full_text(record['DOI'])
    record['URL'] = 'https://doi.org/{}'.format(record['DOI'])

    # provided by authors
    record['Keywords'] = [
        tag.get_text()
        for tag in soup.find_all('author-keyword')
        if tag.parent.name == 'authkeywords'
    ]
    # provided by indexing service
    record['IndexTerms'] = [
        tag.get_text()
        for tag in soup.find_all('mainterm')
        if tag.parent.name == 'idxterms'
    ]
    record['OpenAccess'] = get_text(soup, 'openaccess')
    record['References'] = [
        tag.get_text().replace('\t', ' ')
        for tag in soup.find_all('ref-fulltext')
    ]
    record['SubjectAreas'] = [
        tag.get_text().replace('\t', ' ')
        for tag in soup.find_all('subject-area')
        if tag.parent.name == 'subject-areas'
    ]
    record['CitedBy'] = get_text(soup, 'citedby-count')

    cited_by_link = soup.find(rel='scopus-citedby')
    record['Scopus Cited By'] = cited_by_link['href'] if cited_by_link else ''

    for key, tag in (
        ('Funding Acronym', 'fund-acr'),
        ('Funding Agency ID', 'fund-no'),
        ('Funding Agency', 'fund-sponsor'),
        ('Scopus Identifier', 'dc:identifier'),
    ):
        record[key] = get_text(soup, tag)

    # authors information
    # sometimes authors is blank, and names are listed in
    # <bibrecord><author-group>...</author-group></bibrecord>
    # (that fallback is not handled here)
    authors_tag = soup.find('authors')
    if authors_tag:
        record['Authors'] = [
            {
                'seq': author['seq'],
                'ce:indexed-name': get_text(author, 'ce:indexed-name'),
                'ce:given-name': get_text(author, 'ce:given-name'),
                'ce:surname': get_text(author, 'ce:surname'),
                'author-url': get_text(author, 'author-url'),
                'auid': author['auid'],
                'afid': [aff['id'] for aff in author.find_all('affiliation')],
            }
            for author in authors_tag.find_all('author')
        ]
    else:
        record['Authors'] = []

    # affiliation information: only <affiliation> elements directly under the
    # response root are used, not the ones nested inside authors.
    record['Affiliations'] = [
        {
            'afid': aff['id'],
            'affilname': get_text(aff, 'affilname'),
            'affiliation-city': get_text(aff, 'affiliation-city'),
            'affiliation-country': get_text(aff, 'affiliation-country'),
        }
        for aff in soup.find_all('affiliation')
        if aff.parent.name == 'abstracts-retrieval-response'
    ]

    return record


def flatten_for_pandas(d):
    """Convert every cell of a column dict to a string.

    Args:
        d: Mapping of column name to a list of cell values.

    Returns:
        A new dict with the same keys. Each list-valued cell becomes its items
        joined with ``"; "``; every other cell becomes ``str(cell)``.
    """
    def as_cell(value):
        # isinstance (rather than an exact type comparison) so that list
        # subclasses are joined too instead of being rendered as a repr.
        if isinstance(value, list):
            return "; ".join(str(item) for item in value)
        return str(value)

    return {
        column: [as_cell(value) for value in values]
        for column, values in d.items()
    }


def scopus_retrive(scopus_query, save_to_file=None):
    """Search Scopus, load every hit, and return the results as a DataFrame.

    Args:
        scopus_query: Query passed to ``scopus_search``.
        save_to_file: Optional path; when truthy the frame is also written
            there as CSV without the index column.

    Returns:
        A ``pandas.DataFrame`` built from the per-article values after they
        have been converted to strings by ``flatten_for_pandas``.
    """
    # get list of article ids
    hits = scopus_search(scopus_query)

    # download each article, gathering values column by column
    columns = {}
    for index, hit in enumerate(hits):
        if index % 200 == 0:
            print(index)
        article_url = hit['prism:url']
        # The article id is the last path segment of the retrieval URL.
        article_id = article_url.rsplit('/', 1)[-1]

        record = scopus_get_json(article_id, article_url)
        for field in record:
            columns.setdefault(field, []).append(record[field])

    # convert to pandas and save
    frame = pd.DataFrame.from_dict(flatten_for_pandas(columns))
    if save_to_file:
        frame.to_csv(save_to_file, index=False)
    return frame