# Details

This Jupyter notebook adds 5 variables to the papers produced by `1_add_papers.ipynb`: `citedby-count`, `keywords`, `funding`, `ref-count`, and `ref-list`. It also adds a `subject-areas` field to each paper (see "add subject-area" below).

The process may take several hours because the function `add_other_variables()` uses Selenium to scrape each paper's page (roughly 8-10 seconds per paper).

Alternatively, you can also directly download the zip file of cleaned papers from [here](https://drive.google.com/file/d/1J1gk4LVZSjy-B2BDxy5SrwZ3LMHHmIRY/view?usp=sharing) and skip running `1_add_papers.ipynb`, `2_add_variables.ipynb`, and `3_clean_data.ipynb`.

# imports

In [None]:
!pip3 install selenium
!pip3 install bs4
!pip3 install requests

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import json
import time
import csv

# add subject-area

In [None]:
# Build csv_and_json/ASJC.json from csv_and_json/ASJC.csv.
# Each entry maps the "Code" column (the ASJC code) to the "Description"
# column (the subject area).

with open("csv_and_json/ASJC.csv", encoding="utf-8") as csv_in:
    asjcdict = {
        record['Code']: record['Description']
        for record in csv.DictReader(csv_in)
    }

with open("csv_and_json/ASJC.json", "w", encoding="utf-8") as json_out:
    json.dump(asjcdict, json_out, indent=4)

In [None]:
# Build csv_and_json/sources.json from csv_and_json/sources.csv.
# Each "SourceID" maps to the list of codes found in its ";"-separated
# "ASJC" column, with surrounding whitespace removed from every code.

with open("csv_and_json/sources.csv", encoding="utf-8") as csv_in:
    sourcesdict = {
        record['SourceID']: [code.strip() for code in record['ASJC'].split(";")]
        for record in csv.DictReader(csv_in)
    }

with open("csv_and_json/sources.json", "w", encoding="utf-8") as json_out:
    json.dump(sourcesdict, json_out, indent=4)

In [None]:
# Lookup tables read by add_asjc():
#   all_sources - source id -> list of ASJC codes (written by the sources.csv cell)
#   asjc_bycode - ASJC code -> value stored under "$" in each subject-area entry
# NOTE(review): the ASJC conversion cell writes "ASJC.json", while this cell
# reads "ASJC_bycode.json" -- confirm that file exists or is produced elsewhere.
with open("csv_and_json/sources.json") as sources_file:
    all_sources = json.load(sources_file)

with open("csv_and_json/ASJC_bycode.json") as asjc_file:
    asjc_bycode = json.load(asjc_file)

In [None]:
def add_asjc(id):
    """Attach the subject areas of a paper's source to its JSON file.

    Reads ``papers/paper<NNNNN>.json`` (``id`` left-padded with zeros to five
    characters), looks up the paper's ``source-id`` in the module-level
    ``all_sources`` table and translates every ASJC code listed there through
    the module-level ``asjc_bycode`` table. The resulting list of
    ``{"$": <asjc_bycode value>, "@code": <code>}`` entries is stored under
    the top-level ``"subject-areas"`` key and the file is rewritten in place.

    A paper without a ``source-id``, with a source missing from
    ``all_sources``, or with codes missing from ``asjc_bycode`` simply gets
    fewer (possibly zero) entries.

    Args:
        id: Paper number; ``str(id)`` padded to five characters selects the
            file.
    """
    number = str(id).rjust(5, "0")
    path = f"papers/paper{number}.json"

    with open(path) as f:
        paper = json.load(f)

    # .get(): a record without a source id is treated like an unknown source
    # instead of aborting the whole batch with a KeyError.
    source_id = paper["abstracts-retrieval-response"]["coredata"].get("source-id")
    paper["subject-areas"] = [
        {"$": asjc_bycode[asjc], "@code": asjc}
        for asjc in all_sources.get(source_id, [])
        if asjc in asjc_bycode
    ]

    # The `with` block closes the file; no explicit close() is needed.
    with open(path, "w") as f:
        json.dump(paper, f, indent=4)
   


In [None]:
# Run add_asjc() on every paper (ids 0 through 9019), echoing each id on one
# line as a progress indicator.
for paper_id in range(9020):
    print(paper_id, end=" ")
    add_asjc(paper_id)

# driver & sign in to scopus
- must be run before calling `add_other_variables()`
- select either Chrome or Firefox as your driver (the Chrome cell is commented out; uncomment it to use Chrome)
- enter the email and password of your Scopus account (**NOT** your CUNET password)

In [1]:
# # For Chrome:
# chrome_options = webdriver.ChromeOptions()
# driver = webdriver.Chrome(options=chrome_options)

In [11]:
# For Firefox:
from selenium.webdriver.firefox.options import Options as FirefoxOptions

# Options object created with no customisation; adjust it here, before the
# driver is created, if specific Firefox settings are needed.
firefox_options = FirefoxOptions()
# Module-level browser session: the sign-in cell and add_other_variables()
# both operate on this `driver`.
driver = webdriver.Firefox(options=firefox_options)

In [None]:
# your email and password for scopus account here
# These are placeholders: the sign-in cell types both values into the Scopus
# login form. Avoid saving or sharing the notebook with real credentials
# filled in.
email = "..@student.chula.ac.th"
password = "verylongpassword"

In [13]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [14]:
# Sign in to Scopus with the `email` and `password` set above, using the
# browser session held in `driver`.
driver.get(url="https://www.scopus.com/home.uri")

# Each wait.until(...) below waits up to 10 seconds for its condition.
wait = WebDriverWait(driver, 10)
# Follow the "check access" link. Every click in this cell is issued through
# JavaScript rather than element.click() (presumably so that an overlay such
# as the cookie banner cannot intercept it -- TODO confirm).
check_access = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[href='/checkaccess.uri']")))
driver.execute_script("arguments[0].click();", check_access)

# Accept the cookie banner once its button is clickable.
accept_cookie = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-accept-btn-handler")))
driver.execute_script("arguments[0].click();", accept_cookie)

# Fill in the e-mail field, then submit it.
org_email = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#bdd-email")))
org_email.clear()
org_email.send_keys(email)
time.sleep(1)  # short pause before looking up the submit button
submit_email = driver.find_element(By.CSS_SELECTOR, "#bdd-els-searchBtn")
driver.execute_script("arguments[0].click();", submit_email)

# Fill in the password field once it is present, then press the sign-in button.
org_password = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#bdd-password")))
org_password.clear()
org_password.send_keys(password)
time.sleep(1)  # short pause before looking up the sign-in button
signin = driver.find_element(By.CSS_SELECTOR, "#bdd-elsPrimaryBtn")
driver.execute_script("arguments[0].click();", signin)


# add other variables
- citedby-count
- keywords
- funding
- ref-count
- ref-list

In [None]:
def add_other_variables(id):
    """Scrape a paper's Scopus page with Selenium and save five extra fields.

    Loads ``papers/paper<NNNNN>.json`` (``id`` left-padded with zeros to five
    characters), opens the URL stored at ``coredata -> link[1] -> @href`` in
    the module-level ``driver``, waits two seconds, parses the rendered page
    and rewrites the JSON file with whichever of these top-level keys could
    be extracted:

    - ``citedby-count``: third whitespace-separated token of the heading
      inside ``#recordPageBoxes``.
    - ``funding``: text of every funding-detail paragraph. Always written;
      ``[]`` when the page lists no funding.
    - ``keywords``: author keywords found after ``#author-keywords``.
    - ``ref-count``: second whitespace-separated token of the text around
      ``#references``, with surrounding parentheses removed.
    - ``ref-list``: reference titles. It can be shorter than ``ref-count``
      because some references have no title.

    Counts are stored as the scraped strings, not converted to numbers. A
    page element that is missing or shaped unexpectedly leaves its key
    untouched (or shortens the list) instead of raising.

    The sign-in cell must have been run on ``driver`` beforehand.

    Args:
        id: Paper number; ``str(id)`` padded to five characters selects the
            file.
    """
    number = str(id).rjust(5, "0")
    path = f"papers/paper{number}.json"

    # Read and close the file before the slow page load.
    with open(path) as f:
        paper = json.load(f)

    toself = paper["abstracts-retrieval-response"]["coredata"]["link"][1]["@href"]
    driver.get(url=toself)
    time.sleep(2)  # give the page time to render before grabbing its source
    # "lxml" requires the lxml package (`!pip install lxml`); if it is not
    # available, replace "lxml" with "html.parser" (Python's built-in parser).
    soup = BeautifulSoup(driver.page_source, "lxml")

    # The selector already requires #recordPageBoxes, so one lookup covers
    # both "box missing" and "heading missing".
    cited_heading = soup.select_one("#recordPageBoxes div div h3")
    if cited_heading is not None and cited_heading.string is not None:
        cited_words = cited_heading.string.split()
        if len(cited_words) > 2:
            paper["citedby-count"] = cited_words[2]

    # select() returns a (possibly empty) list, never None, so "funding" is
    # always written; no funding detail yields [].
    paper["funding"] = [
        e.string for e in soup.select("#funding-details div table tbody tr td p")
    ]

    keywords_heading = soup.select_one("#author-keywords")
    if keywords_heading is not None:
        # find_next_sibling() returns the next tag and skips any whitespace
        # text node that .next_sibling could hand back.
        container = keywords_heading.find_next_sibling()
        keyword_list = []
        for child in (container.children if container is not None else ()):
            if not isinstance(child, Tag):
                continue  # text between the keyword elements
            outer = child.span
            inner = outer.span if outer is not None else None
            if inner is not None:
                keyword_list.append(inner.string)
        paper["keywords"] = keyword_list

    references = soup.select_one("#references")
    if references is not None:
        heading_words = references.parent.text.split()
        if len(heading_words) > 1:
            paper["ref-count"] = heading_words[1].strip("()")

        ref_list = []
        for ref in soup.select("#referenceListRowId td div"):
            # .get(): some divs may carry no class attribute at all
            classes = ref.get("class")
            if classes == ['refAuthorTitle']:
                if ref.em is not None:
                    ref_list.append(ref.em.string)
            elif classes == ['refDocTitle', 'fontMedium']:
                if ref.a is not None:
                    ref_list.append(ref.a.string)
        # len(ref_list) is not always equal to ref_count, as some references
        # do not have a title
        paper["ref-list"] = ref_list

    # The `with` block closes the file; no explicit close() is needed.
    with open(path, "w") as f:
        json.dump(paper, f, indent=4)


In [None]:
# Scrape every paper (ids 0 through 9019), echoing each id on one line as a
# progress indicator. Expect roughly 8-10 seconds per paper.
for paper_id in range(9020):
    print(paper_id, end=" ")
    add_other_variables(paper_id)

In [None]:
# quit driver: ends the Selenium browser session created in the driver cell
driver.quit()

# optional: add 3 variables (fast)
`citedby-count`, `funding`, and `keywords`

In [None]:
!pip3 install cloudscraper

In [None]:
import cloudscraper
# Module-level scraper object: add_3_variables_fast() fetches each paper's
# page with scraper.get().
scraper = cloudscraper.create_scraper()

In [None]:
def add_3_variables_fast(id):
    """Fetch a paper's Scopus page without a browser and save three fields.

    Loads ``papers/paper<NNNNN>.json`` (``id`` left-padded with zeros to five
    characters), downloads the URL stored at
    ``coredata -> link[1] -> @href`` with the module-level ``scraper``,
    parses the response and rewrites the JSON file with these top-level keys:

    - ``citedby-count``: third whitespace-separated token of the heading
      inside ``#recordPageBoxes``; left untouched when that heading is
      missing or too short.
    - ``funding``: text of every third cell (starting with the first) of the
      ``#fundingDetails`` table, skipping cells without a single text value.
      Always written; ``[]`` when the page lists no funding.
    - ``keywords``: text of every ``span`` inside ``#authorKeywords``;
      written only when that element exists.

    The count is stored as the scraped string, not converted to a number.

    Args:
        id: Paper number; ``str(id)`` padded to five characters selects the
            file.
    """
    number = str(id).rjust(5, "0")
    path = f"papers/paper{number}.json"

    # Read and close the file before the network request.
    with open(path) as f:
        paper = json.load(f)

    toself = paper["abstracts-retrieval-response"]["coredata"]["link"][1]["@href"]
    page = scraper.get(toself).text
    # if "lxml" raises an error, try "html.parser" instead
    soup = BeautifulSoup(page, "lxml")

    # The selector already requires #recordPageBoxes, so one lookup covers
    # both "box missing" and "heading missing".
    cited_heading = soup.select_one("#recordPageBoxes div div h3")
    if cited_heading is not None and cited_heading.string is not None:
        cited_words = cited_heading.string.split()
        if len(cited_words) > 2:
            paper["citedby-count"] = cited_words[2]

    # select() returns a (possibly empty) list, never None, so "funding" is
    # always written; no funding detail yields [].
    # Only every third cell is kept (presumably the first column of a
    # three-column table -- TODO confirm against a live page).
    funding_cells = soup.select("#fundingDetails table tbody tr td")
    paper["funding"] = [
        td.string
        for position, td in enumerate(funding_cells)
        if position % 3 == 0 and td.string is not None
    ]

    if soup.select_one("#authorKeywords") is not None:
        paper["keywords"] = [
            span.string for span in soup.select("#authorKeywords span")
        ]

    # The `with` block closes the file; no explicit close() is needed.
    with open(path, "w") as f:
        json.dump(paper, f, indent=4)



In [None]:
# Run the fast scraper on every paper (ids 0 through 9019), echoing each id
# on one line as a progress indicator. Expect roughly 0.5 - 1 second per paper.
for paper_id in range(9020):
    print(paper_id, end=" ")
    add_3_variables_fast(paper_id)