In [27]:
import os
import requests
import json
from bs4 import BeautifulSoup as Soup
import pandas as pd

Now provide the Elsevier API key and the query. The API key can be obtained [here](https://dev.elsevier.com/apikey/manage).

Query is constructed as described in the Elsevier instructions.
Boolean logic applies. Use AND to require that both keywords occur, OR to require that at least one of them occurs, and NOT to exclude a keyword. You can find detailed information [here](https://service.elsevier.com/app/answers/detail/a_id/25974/supporthub/sciencedirect/).

Example:

*("market share" OR "leisure tourism") AND stakeholders AND NOT space*

will return articles that contain the keyword "stakeholders" together with at least one of the phrases "market share" or "leisure tourism", but never the keyword "space". Please note that you need to use quotation marks to search for an exact phrase, e.g. "tourism research".

In [28]:
# Elsevier API key. It is read from the ELSEVIER_API_KEY environment variable
# when that is set, so the secret does not have to be stored in the notebook.
# Without the variable the value is an empty string, which can be replaced by
# pasting a key here.
APIKey = os.environ.get("ELSEVIER_API_KEY", "")

# Search expression. It is inserted verbatim into the request URL, so spaces
# are written as '+'.
query = "tourism+AND+innovation"

We build the data structure with the following attributes:

In [89]:
# Column-oriented store for the harvested records: every column name maps to
# its own, initially empty, list. One value per column is appended for each
# article, so all lists are expected to stay the same length.
data = {
    column: []
    for column in (
        'Title',
        'Authors',
        'PublicationName',
        'Type',
        'Abstract',
        'Content',
        'Volume',
        'Issue',
        'Date',
        'Pages',
        'PII',
        'Keywords',
        'URL',
        'OpenAccess',
        'References',
        'CitedBy',
        'AuthorAUID',
        'AuthKeywords',
        'SubjectAreas',
    )
}

Now we retrieve up to 6000 articles from Elsevier and parse them on the fly. We use Scopus to get 'cited by' information and keywords.

We also print progress as we go (the current result offset, once for every page of 25 search results).

In [90]:
# --- Crawl settings ----------------------------------------------------------
MAX_RESULTS = 6000      # exclusive upper bound for the search `start` offset
PAGE_SIZE = 25          # step between two successive `start` offsets
REQUEST_TIMEOUT = 60    # seconds before a request is abandoned instead of hanging

# Columns that are filled from the Scopus abstract record.
SCOPUS_COLUMNS = ('CitedBy', 'AuthorAUID', 'AuthKeywords', 'SubjectAreas')


def _text(soup, tag, flatten=False):
    """Return the text of the first ``tag`` element in ``soup``, or '' if absent.

    With ``flatten=True`` newlines inside the text are replaced by spaces.
    """
    node = soup.find(tag)
    if node is None:
        return ''
    text = node.get_text()
    return text.replace('\n', ' ') if flatten else text


def _join_text(nodes, parent=None):
    """Join the texts of ``nodes`` with '; '.

    When ``parent`` is given, only nodes whose direct parent tag has that name
    are kept. An empty selection yields ''.
    """
    return '; '.join(node.get_text() for node in nodes
                     if parent is None or node.parent.name == parent)


def _parse_article(soup, url):
    """Extract the article-level columns from a parsed article document."""
    return {
        'URL': str(url),
        'Title': _text(soup, 'dc:title', flatten=True),
        'Authors': _join_text(soup.find_all('dc:creator')),
        'PublicationName': _text(soup, 'prism:publicationname'),
        'Type': _text(soup, 'prism:aggregationtype'),
        'Abstract': _text(soup, 'dc:description', flatten=True),
        'Content': _text(soup, 'ce:sections', flatten=True),
        'Volume': _text(soup, 'prism:volume'),
        'Issue': _text(soup, 'prism:issueidentifier'),
        'Date': _text(soup, 'prism:coverdate'),
        'Pages': _text(soup, 'prism:pagerange'),
        'PII': _text(soup, 'pii'),
        'Keywords': _join_text(soup.find_all('dcterms:subject')),
        # A missing <openaccess> tag yields '' instead of raising AttributeError.
        'OpenAccess': _text(soup, 'openaccess'),
        'References': _join_text(soup.find_all('ce:label'), parent='ce:bib-reference'),
    }


def _parse_scopus(soup2):
    """Extract the Scopus columns from a parsed Scopus abstract document."""
    auids = [author.get('auid') for author in soup2.find_all('author')
             if author.parent.name == 'authors']
    return {
        'CitedBy': _text(soup2, 'citedby-count'),
        # Authors without an 'auid' attribute are left out rather than raising.
        'AuthorAUID': '; '.join(auid for auid in auids if auid),
        'AuthKeywords': _join_text(soup2.find_all('author-keyword'), parent='authkeywords'),
        'SubjectAreas': _join_text(soup2.find_all('subject-area'), parent='subject-areas'),
    }


def _fetch_record(url):
    """Download one article (plus its Scopus record, if any) and return a row.

    The returned dict holds exactly one value for every column of ``data``.
    Raises ``requests.RequestException`` when a download fails.
    """
    article = requests.get("{}?apiKey={}".format(url, APIKey), timeout=REQUEST_TIMEOUT)
    soup = Soup(article.content, features="lxml")
    record = _parse_article(soup, url)

    sc_id = soup.find('scopus-id')
    if sc_id is not None:
        scopus = requests.get(
            "https://api.elsevier.com/content/abstract/scopus_id/{}?apiKey={}".format(
                sc_id.get_text(), APIKey),
            timeout=REQUEST_TIMEOUT)
        # The Scopus columns must be read from the Scopus response, not from
        # the article document.
        record.update(_parse_scopus(Soup(scopus.content, features="lxml")))
    else:
        record.update(dict.fromkeys(SCOPUS_COLUMNS, ''))
    return record


for start in range(0, MAX_RESULTS, PAGE_SIZE):
    print(start)
    response = requests.get(
        "https://api.elsevier.com/content/search/sciencedirect?query={}&apiKey={}&start={}".format(
            query, APIKey, start),
        timeout=REQUEST_TIMEOUT)
    cont = json.loads(response.content)

    # Error payloads and exhausted result sets may lack the expected keys, and
    # an entry without 'prism:url' cannot be fetched.
    entries = cont.get("search-results", {}).get("entry") or []
    urls = [entry['prism:url'] for entry in entries
            if isinstance(entry, dict) and entry.get('prism:url')]
    if not urls:
        print("No retrievable results at start={}; stopping.".format(start))
        break

    for url in urls:
        try:
            record = _fetch_record(url)
        except requests.RequestException as exc:
            print("Skipping {}: {}".format(url, exc))
            continue
        # Append the complete row at once so that all columns of `data` keep
        # the same length even if parsing of an article fails part-way.
        for column, value in record.items():
            data[column].append(value)
        

0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000
1025
1050
1075
1100
1125
1150
1175
1200
1225
1250
1275
1300
1325
1350
1375
1400
1425
1450
1475
1500
1525
1550
1575
1600
1625
1650
1675
1700
1725
1750
1775
1800
1825
1850
1875
1900
1925
1950
1975
2000
2025
2050
2075
2100
2125
2150
2175
2200
2225
2250
2275
2300
2325
2350
2375
2400
2425
2450
2475
2500
2525
2550
2575
2600
2625
2650
2675
2700
2725
2750
2775
2800
2825
2850
2875
2900
2925
2950
2975
3000
3025
3050
3075
3100
3125
3150
3175
3200
3225
3250
3275
3300
3325
3350
3375
3400
3425
3450
3475
3500
3525
3550
3575
3600
3625
3650
3675
3700
3725
3750
3775
3800
3825
3850
3875
3900
3925
3950
3975
4000
4025
4050
4075
4100
4125
4150
4175
4200
4225
4250
4275
4300
4325
4350
4375
4400
4425
4450
4475
4500
4525
4550
4575
4600
4625
4650
4675
4700
4725
4750
4775
4800
4825
4850
4875
4900
4925
4950
4975
5000
5025
5050
5075
5100
5125
5150
5175
5200


Save data locally.

In [91]:
# Turn the column lists into a table and write it, without the index column,
# to 'Stakeholders.csv' in the current working directory.
csv_path = os.path.join(os.getcwd(), 'Stakeholders.csv')
df = pd.DataFrame(data)
df.to_csv(csv_path, index=False)