# Details

This Jupyter notebook retrieves papers affiliated with Chulalongkorn that were published from 2014 to 2017, using the Scopus API.

The retrieved papers will be saved in the `papers` folder after you press "Run All". The process may take several hours.

Alternatively, you can also directly download the zip file of cleaned papers from [here](https://drive.google.com/file/d/1J1gk4LVZSjy-B2BDxy5SrwZ3LMHHmIRY/view?usp=sharing) and skip running `1_add_papers.ipynb`, `2_add_variables.ipynb`, and `3_clean_data.ipynb`.

# Import & create folder

In [None]:
# create folder "papers" to store the retrieve papers, which is in JSON format
!mkdir papers

In [None]:
# install "requests" if it is not on your computer
!pip3 install requests

In [None]:
import requests
import json

In [None]:
# A new API key can be created at https://dev.elsevier.com/apikey/manage
# NOTE(review): a key committed to the notebook is visible to everyone who can
# read it. Set the SCOPUS_API_KEY environment variable to use your own key
# without editing this cell; the key below is only used as a fallback.
import os

apikey = os.environ.get("SCOPUS_API_KEY") or "6eaf618799ef1af803cb82754e49d71a"

# Retrieve 2016 - 2017 papers

In [3]:
# Set the first and last paper index to retrieve here.
# `last` is **exclusive**: with these values you'll get paper00000.json - paper04999.json.
# The download loop divides both values by 25 (integer division), so use multiples of 25.
start = 0
last = 5000

In [None]:
# *Exclusive end*
# For example, get_scopusid_dict(0, 300) will result in
# scopusIDlist.json with id = 0 to 299.

def get_scopusid_dict(start, end):
    """Collect the Scopus URLs of search results ``start`` .. ``end - 1``.

    The search (affiliation "chulalongkorn", publication year 2014-2017) is
    requested page by page, stepping the ``start`` offset by 25. Each result's
    "prism:url" is stored under its index in the overall result list, and the
    mapping is also written to ``scopusIDlist.json``.

    Args:
        start: index of the first search result to include (inclusive).
        end: index one past the last search result to include (exclusive).

    Returns:
        A dict mapping result index (int) to its "prism:url" string, or
        ``None`` if the API answered with HTTP 429 (rate limit). In that case
        ``scopusIDlist.json`` is NOT rewritten.
    """
    page_size = 25
    scopus_dict = {}

    # Round the page range outwards so that a start/end that is not a multiple
    # of the page size still covers every requested index; entries outside
    # [start, end) are filtered out below.
    first_page = start // page_size
    stop_page = -(-end // page_size)  # ceiling division

    for page in range(first_page, stop_page):
        offset = page * page_size
        response = requests.get(f"https://api.elsevier.com/content/search/scopus?start={offset}&httpaccept=application/json&query=AFFIL ( chulalongkorn ) AND PUBYEAR > 2013 AND PUBYEAR < 2018&apiKey={apikey}")

        # Check the numeric status code rather than the repr of the response.
        if response.status_code == 429:
            print(f"Rate limit reached at get_scopusid_dict({start}, {end})! Please get a new API key at https://dev.elsevier.com/apikey/manage")
            return

        paperlist = json.loads(response.text)["search-results"]["entry"]
        for position, paper in enumerate(paperlist):
            # Index of this entry within the whole result list.
            paper_id = offset + position
            if start <= paper_id < end:
                scopus_dict[paper_id] = paper["prism:url"]

    with open("scopusIDlist.json", "w") as f:
        json.dump(scopus_dict, f, indent=4)
    return scopus_dict

In [None]:
def get_paper_by_scopusAPI(id, url):
    """Download one paper record and save it as ``papers/paperNNNNN.json``.

    Args:
        id: paper index (int or numeric string); zero-padded to five digits
            to build the output file name.
        url: the paper's Scopus URL (the "prism:url" value from the search
            results); the API key and JSON accept parameter are appended.

    Returns:
        None. On HTTP 429 (rate limit) a message is printed and no file is
        written.
    """
    number = str(id).zfill(5)

    response = requests.get(f"{url}?httpaccept=application/json&apiKey={apikey}")

    # Check the numeric status code rather than the repr of the response.
    if response.status_code == 429:
        print(f"Rate limit reached at get_paper_by_scopusAPI({id}, {url})! Please get a new API key at https://dev.elsevier.com/apikey/manage")
        return

    mydata = json.loads(response.text)
    # The "with" block closes the file; no explicit close() is needed.
    with open(f"papers/paper{number}.json", "w") as f:
        json.dump(mydata, f, indent=4)


In [None]:
# Download the papers in batches of 25 search results.
for i in range(start//25, last//25):
    print(f"batch {i*25} to {i*25 + 24}")
    scopus_dict = get_scopusid_dict(i*25, i*25 + 25)

    # get_scopusid_dict returns None when the API rate limit is hit and does
    # not rewrite scopusIDlist.json in that case. Re-reading the file here
    # would silently re-download the previous batch, so stop instead.
    if scopus_dict is None:
        print(f"Stopped before batch {i*25}; set start = {i*25} to resume.")
        break

    for paper_id, url in scopus_dict.items():
        get_paper_by_scopusAPI(paper_id, url)

# Retrieve 2014 - 2015 papers

In [None]:
# Set the first and last paper id to retrieve here.
# `last2` is **exclusive**: with these values ids 0 - 4174 are retrieved.
# get_paper_by_scopusAPI_before2016 adds 4857 to each id when naming the file,
# so the files written are paper04857.json - paper09031.json.
# The download loop divides both values by 25 (integer division), so use multiples of 25.
start2 = 0
last2 = 4175

In [None]:
# *Exclusive end*
# For example, get_scopusid_dict(0, 300) will result in
# scopusIDlist.json with id = 0 to 299.

def get_scopusid_dict(start, end):
    """Collect the Scopus URLs of search results ``start`` .. ``end - 1``.

    This definition replaces the earlier ``get_scopusid_dict`` in this
    notebook; it differs only in the query, which is restricted to
    publication years 2014-2015. The search is requested page by page,
    stepping the ``start`` offset by 25. Each result's "prism:url" is stored
    under its index in the overall result list, and the mapping is also
    written to ``scopusIDlist.json``.

    Args:
        start: index of the first search result to include (inclusive).
        end: index one past the last search result to include (exclusive).

    Returns:
        A dict mapping result index (int) to its "prism:url" string, or
        ``None`` if the API answered with HTTP 429 (rate limit). In that case
        ``scopusIDlist.json`` is NOT rewritten.
    """
    page_size = 25
    scopus_dict = {}

    # Round the page range outwards so that a start/end that is not a multiple
    # of the page size still covers every requested index; entries outside
    # [start, end) are filtered out below.
    first_page = start // page_size
    stop_page = -(-end // page_size)  # ceiling division

    for page in range(first_page, stop_page):
        offset = page * page_size
        response = requests.get(f"https://api.elsevier.com/content/search/scopus?start={offset}&httpaccept=application/json&query=AFFIL ( chulalongkorn ) AND PUBYEAR > 2013 AND PUBYEAR < 2016&apiKey={apikey}")

        # Check the numeric status code rather than the repr of the response.
        if response.status_code == 429:
            print(f"Rate limit reached at get_scopusid_dict({start}, {end})! Please get a new API key at https://dev.elsevier.com/apikey/manage")
            return

        paperlist = json.loads(response.text)["search-results"]["entry"]
        for position, paper in enumerate(paperlist):
            # Index of this entry within the whole result list.
            paper_id = offset + position
            if start <= paper_id < end:
                scopus_dict[paper_id] = paper["prism:url"]

    with open("scopusIDlist.json", "w") as f:
        json.dump(scopus_dict, f, indent=4)
    return scopus_dict

In [None]:
def get_paper_by_scopusAPI_before2016(id, url, offset=4857):
    """Download one paper record and save it as ``papers/paperNNNNN.json``.

    The file number is ``id + offset``, zero-padded to five digits.

    Args:
        id: paper index within the 2014-2015 search results (int or numeric
            string).
        url: the paper's Scopus URL (the "prism:url" value from the search
            results); the API key and JSON accept parameter are appended.
        offset: value added to ``id`` to obtain the file number. Defaults to
            4857 -- presumably the number of papers already stored by the
            earlier retrieval, so the numbering continues after them
            (TODO confirm).

    Returns:
        None. On HTTP 429 (rate limit) a message is printed and no file is
        written.
    """
    number = str(int(id) + offset).zfill(5)

    response = requests.get(f"{url}?httpaccept=application/json&apiKey={apikey}")

    # Check the numeric status code rather than the repr of the response.
    if response.status_code == 429:
        print(f"Rate limit reached at get_paper_by_scopusAPI_before2016({id}, {url})! Please get a new API key at https://dev.elsevier.com/apikey/manage")
        return

    mydata = json.loads(response.text)
    # The "with" block closes the file; no explicit close() is needed.
    with open(f"papers/paper{number}.json", "w") as f:
        json.dump(mydata, f, indent=4)


In [None]:
# Download the 2014-2015 papers in batches of 25 search results.
for i in range(start2//25, last2//25):
    print(f"batch {i*25} to {i*25 + 24}")
    scopus_dict = get_scopusid_dict(i*25, i*25 + 25)

    # get_scopusid_dict returns None when the API rate limit is hit and does
    # not rewrite scopusIDlist.json in that case. Re-reading the file here
    # would silently re-download the previous batch, so stop instead.
    if scopus_dict is None:
        print(f"Stopped before batch {i*25}; set start2 = {i*25} to resume.")
        break

    for paper_id, url in scopus_dict.items():
        get_paper_by_scopusAPI_before2016(paper_id, url)

# Delete unnecessary files

In [None]:
!rm scopusIDlist.json