In [1]:
import os
import requests
import json
from bs4 import BeautifulSoup as Soup
import pandas as pd

Now provide the Elsevier API key and the query. An API key can be obtained [here](https://dev.elsevier.com/apikey/manage).

The query is constructed as described in the Elsevier instructions.
Boolean logic applies: use AND to require both keywords, OR to match either keyword, and NOT to exclude a keyword. You can find detailed information [here](https://service.elsevier.com/app/answers/detail/a_id/25974/supporthub/sciencedirect/).

Example:

*("market share" OR "leisure tourism") AND stakeholders AND NOT space*

will return articles that contain the keyword "stakeholders" together with either the phrase "market share" or the phrase "leisure tourism", but never the keyword "space". Please note that you need to use quotation marks to search for an exact phrase, e.g. "tourism research".

In [2]:
# Read the key from the ELSEVIER_API_KEY environment variable so the credential
# never has to be saved inside the notebook. The empty-string fallback keeps the
# previous default when the variable is not set.
APIKey = os.environ.get("ELSEVIER_API_KEY", "")
# The query is inserted into the request URL verbatim, so spaces are written as "+".
query = "tourism+AND+innovation"

In [3]:
# Column-oriented accumulator: every output column starts with its own,
# independent empty list. Column order here is the column order of the CSV.
data = {column: [] for column in (
    'Title',
    'Authors',
    'PublicationName',
    'Type',
    'Abstract',
    'Content',
    'Volume',
    'Issue',
    'Date',
    'Pages',
    'PII',
    'Keywords',
    'URL',
    'OpenAccess',
    'References',
    'CitedBy',
    'AuthorAUID',
    'AuthKeywords',
    'SubjectAreas',
)}

Now we retrieve 6000 articles from Elsevier and parse them on the fly. We use Scopus to get 'cited by' information and keywords.

We also print progress as we go (once for every page of 25 search results).

In [7]:
def _first_text(node, tag, flatten=False):
    """Return the text of the first ``tag`` element under ``node``, or '' if absent.

    ``flatten=True`` replaces newlines with spaces so the value stays on one line.
    """
    found = node.find(tag)
    if found is None:
        return ''
    text = found.get_text()
    return text.replace('\n', ' ') if flatten else text


def _joined_text(node, tag, parent=None):
    """Join the text of every ``tag`` element under ``node`` with '; '.

    When ``parent`` is given, only elements whose direct parent has that tag name
    are kept. Returns '' when nothing matches.
    """
    return '; '.join(item.get_text() for item in node.find_all(tag)
                     if parent is None or item.parent.name == parent)


# Walk the search results one page (25 hits) at a time, fetch and parse every
# article, and write each page to its own CSV file.
for i in range(0, 6000, 25):
    print("Retrieving {}/6000.".format(i))
    response = requests.get(
        "https://api.elsevier.com/content/search/sciencedirect?query={}&apiKey={}&start={}".format(query, APIKey, i))
    # Surface HTTP errors (bad key, quota, ...) directly instead of failing later
    # with a KeyError on the missing "search-results" field.
    response.raise_for_status()
    cont = json.loads(response.content)
    entries = cont.get("search-results", {}).get("entry", [])
    if not entries:
        # No further hits for this query: stop rather than request empty pages.
        break

    for j in entries:
        url = j.get('prism:url')
        if not url:
            # Entry without an article link; nothing to retrieve for it.
            continue
        data['URL'].append(str(url))
        article = requests.get("{}?apiKey={}".format(url, APIKey))
        soup = Soup(article.content, features="lxml")

        data['Title'].append(_first_text(soup, 'dc:title', flatten=True))
        data['Authors'].append(_joined_text(soup, 'dc:creator'))
        data['PublicationName'].append(_first_text(soup, 'prism:publicationname'))
        data['Type'].append(_first_text(soup, 'prism:aggregationtype'))
        data['Abstract'].append(_first_text(soup, 'dc:description', flatten=True))
        data['Content'].append(_first_text(soup, 'ce:sections', flatten=True))
        data['Volume'].append(_first_text(soup, 'prism:volume'))
        data['Issue'].append(_first_text(soup, 'prism:issueidentifier'))
        data['Date'].append(_first_text(soup, 'prism:coverdate'))
        data['Pages'].append(_first_text(soup, 'prism:pagerange'))
        data['PII'].append(_first_text(soup, 'pii'))
        data['Keywords'].append(_joined_text(soup, 'dcterms:subject'))
        # Guarded like every other field: a missing tag yields '' instead of
        # raising AttributeError on None.
        data['OpenAccess'].append(_first_text(soup, 'openaccess'))
        data['References'].append(_joined_text(soup, 'ce:label', parent='ce:bib-reference'))

        sc_id = soup.find('scopus-id')

        if sc_id:
            scopus = requests.get("https://api.elsevier.com/content/abstract/scopus_id/{}?apiKey={}".format(sc_id.get_text(), APIKey))
            soup2 = Soup(scopus.content, features="lxml")

            # These fields are read from the Scopus response (soup2), not from the
            # ScienceDirect article (soup).
            data['CitedBy'].append(_first_text(soup2, 'citedby-count'))
            data['AuthorAUID'].append('; '.join(auth['auid'] for auth in soup2.find_all('author')
                                                if auth.parent.name == 'authors' and auth.has_attr('auid')))
            data['AuthKeywords'].append(_joined_text(soup2, 'author-keyword', parent='authkeywords'))
            data['SubjectAreas'].append(_joined_text(soup2, 'subject-area', parent='subject-areas'))
        else:
            data['CitedBy'].append('')
            data['AuthorAUID'].append('')
            data['AuthKeywords'].append('')
            data['SubjectAreas'].append('')

    # Make sure the output folder exists before writing this page's CSV.
    os.makedirs(os.path.join(os.getcwd(), 'Innovation'), exist_ok=True)
    df = pd.DataFrame.from_dict(data)
    df.to_csv(os.path.join(os.getcwd(), 'Innovation/Innovation_{}.csv'.format(i)), index=False)
    # Reset with a fresh list per column. dict.fromkeys(data, []) would make every
    # column share one list object and mix all fields together on the next page.
    data = {column: [] for column in data}
        

Retrieving 0/6000.
0
Retrieving 25/6000.
25
Retrieving 50/6000.
50
Retrieving 75/6000.
75
Retrieving 100/6000.
100
Retrieving 125/6000.
125
Retrieving 150/6000.
150
Retrieving 175/6000.
175
Retrieving 200/6000.
200
Retrieving 225/6000.
225
Retrieving 250/6000.
250
Retrieving 275/6000.
275
Retrieving 300/6000.
300
Retrieving 325/6000.
325
Retrieving 350/6000.
350
Retrieving 375/6000.
375
Retrieving 400/6000.


KeyboardInterrupt: 