# Details

This Jupyter notebook cleans the papers from both the given dataset and the ones scraped from the Scopus API.

Before running, please configure the path for each dataset in its section. The default paths assume that the given dataset is in the folder `Data 2018-2023` and the scraped dataset is in the folder `papers`.

Alternatively, you can also directly download the zip file of cleaned papers from [here](https://drive.google.com/drive/folders/1zEumhkVybecdpP9SETxyF5gMEYWHQbdJ?usp=sharing) and skip running `1_add_papers.ipynb`, `2_add_variables.ipynb`, and `3_clean_data.ipynb`.

# Clean given dataset

In [None]:
import pandas as pd
import json
from os import listdir
from os.path import isfile, join

In [None]:
# Folder holding the raw records of the given dataset (edit to match your setup)
path = './Data 2018-2023/2023/'
# Names of the regular files directly inside `path`; sub-directories are skipped
fileNames = list(filter(lambda entry: isfile(join(path, entry)), listdir(path)))

In [None]:
# create cleaned data
#
# For every raw JSON file listed in `fileNames`, extract the fields used
# downstream (ids, title, date, subject areas, keywords, authors, citation and
# reference info, affiliation, funding) and write them as one flat JSON
# document with the same file name into the clean folder.
# Records that have no 'dc:title' are skipped.
from os import makedirs  # imported here so this cell stays self-contained

clean_dir = "./Data 2018-2023/clean/"
# Create the output folder up front; without it the first write fails with
# FileNotFoundError on a fresh checkout.
makedirs(clean_dir, exist_ok=True)

FUNDING_KEY = 'xocs:funding-agency-matched-string'


def _as_list(value):
    """Return *value* unchanged if it is a list, otherwise wrap it in a list.

    The reference and funding fields hold either one object or a list of
    objects; normalising lets a single loop handle both shapes.
    """
    return value if isinstance(value, list) else [value]


for fileName in fileNames:
    # Only hold the input file open for as long as it takes to parse it.
    with open(path + fileName, 'r', encoding="utf8") as file:
        data = json.load(file)

    response = data['abstracts-retrieval-response']
    coredata = response['coredata']
    if 'dc:title' not in coredata:
        continue
    print('creating: ' + fileName)

    # Author keywords: the field may be absent or null.
    keywords = []
    auth_keywords = response.get('authkeywords')
    if auth_keywords is not None:
        author_keyword = auth_keywords['author-keyword']
        if isinstance(author_keyword, list):
            keywords = [k['$'] for k in author_keyword]
        else:
            # A lone entry is split on ', ' (presumably it can carry several
            # keywords in one string -- behaviour kept from the original).
            keywords = author_keyword['$'].split(', ')

    # "<given name> <surname>", with null name parts rendered as ''.
    authors = [
        str(author['preferred-name']['ce:given-name'] or '')
        + ' '
        + str(author['preferred-name']['ce:surname'] or '')
        for author in response['authors']['author']
    ]

    # Bibliography: keep the declared count and the titles of the references
    # that have one.
    refCount = 0
    refList = []
    tail = response['item']['bibrecord']['tail']
    if tail is not None:
        bibliography = tail['bibliography']
        refCount = bibliography['@refcount']
        refList = [
            ref['ref-info']['ref-title']['ref-titletext']
            for ref in _as_list(bibliography['reference'])
            if 'ref-title' in ref['ref-info']
        ]

    # Funding agencies: 'xocs:meta' and 'xocs:funding-list' are both treated
    # as optional, so a record with meta but no funding list no longer raises
    # KeyError.
    fund = []
    xocs_meta = response['item'].get('xocs:meta') or {}
    funding_list = xocs_meta.get('xocs:funding-list') or {}
    if 'xocs:funding' in funding_list:
        fund = [
            entry[FUNDING_KEY]
            for entry in _as_list(funding_list['xocs:funding'])
            if FUNDING_KEY in entry
        ]

    dictionary = {
        'eid': coredata['eid'],
        'title': coredata['dc:title'],
        'cover-date': coredata['prism:coverDate'],
        'subject-areas': response['subject-areas']['subject-area'],
        'keywords': keywords,
        'authors': authors,
        # A null/empty citation count is stored as 0.
        'citedby-count': int(coredata['citedby-count'] or 0),
        'ref-count': int(refCount),
        'ref-list': refList,
        'affiliation': response['affiliation'],
        'funding': fund
    }

    # Serializing json
    json_object = json.dumps(dictionary, indent=4)

    with open(clean_dir + fileName, "w", encoding="utf8") as outfile:
        outfile.write(json_object)

# Clean scraped dataset

In [None]:
import pandas as pd
import json
from os import listdir
from os.path import isfile, join

In [None]:
# Folder holding the scraped Scopus records (edit to match your setup)
path = './papers/'
# Names of the regular files directly inside `path`; sub-directories are skipped
fileNames = list(filter(lambda entry: isfile(join(path, entry)), listdir(path)))

In [None]:
# create cleaned data
#
# For every scraped JSON file listed in `fileNames`, collect the fields used
# downstream and write them as one flat JSON document with the same file name
# into ./cleanPapers/. Records lacking 'dc:title' or 'dc:creator' are skipped.
from os import makedirs  # imported here so this cell stays self-contained

clean_papers_dir = "./cleanPapers/"
# Create the output folder up front; without it the first write fails with
# FileNotFoundError on a fresh checkout.
makedirs(clean_papers_dir, exist_ok=True)

for fileName in fileNames:
    # Only hold the input file open for as long as it takes to parse it.
    with open(path + fileName, 'r', encoding="utf8") as file:
        data = json.load(file)

    response = data['abstracts-retrieval-response']
    coredata = response['coredata']
    if 'dc:title' not in coredata or 'dc:creator' not in coredata:
        continue
    print('creating: ' + fileName)

    # some authors in dc:creator have many affiliation -> repeated names.
    # dict.fromkeys drops the repeats while keeping first-seen order, so the
    # output is the same on every run (a set would give an arbitrary order).
    author_names = (
        str(author['preferred-name']['ce:given-name'] or '')
        + ' '
        + str(author['preferred-name']['ce:surname'] or '')
        for author in coredata['dc:creator']['author']
    )
    authors = list(dict.fromkeys(author_names))

    dictionary = {
        'eid': coredata['eid'],
        'title': coredata['dc:title'],
        'cover-date': coredata['prism:coverDate'],
        'subject-areas': data['subject-areas'],
        # The keys below are optional in the scraped files; fall back to an
        # empty value when one is absent.
        'keywords': data.get('keywords', []),
        'authors': authors,
        # A null/empty citation count is stored as 0.
        'citedby-count': int(coredata['citedby-count'] or 0),
        'ref-count': int(data.get('ref-count', 0)),
        'ref-list': data.get('ref-list', []),
        'affiliation': response['affiliation'],
        # Guarded like the other optional keys instead of raising KeyError.
        'funding': data.get('funding', [])
    }

    # Serializing json
    json_object = json.dumps(dictionary, indent=4)

    with open(clean_papers_dir + fileName, "w", encoding="utf8") as outfile:
        outfile.write(json_object)

# Renaming files

Now, please move all the papers into one new folder and run the code below, which writes each paper into that folder under a sequential, zero-padded name (`00000.json`, `00001.json`, ...):

In [None]:
# Set this to the full path of the new folder that holds all the papers
path = "./sorted_papers"

In [None]:
import json
from os import listdir
from os.path import isfile, join

# Snapshot the folder contents before anything is written. The names are
# sorted so the numbering is reproducible; listdir returns them in arbitrary
# order.
fileNames = sorted(f for f in listdir(path) if isfile(join(path, f)))

# Number every file that is actually present. The previous hard-coded
# range(0, 29217) raised IndexError with fewer files and ignored any extras.
# NOTE(review): the original files are left in place next to the numbered
# copies -- remove them manually if only the numbered files should remain.
for i, fileName in enumerate(fileNames):
    with open(join(path, fileName), 'r', encoding="utf-8") as file:
        d = json.load(file)
    # Zero-pad the index to five digits: 0 -> "00000", 42 -> "00042".
    number = str(i).zfill(5)
    # Write into the configured folder rather than a hard-coded one.
    with open(join(path, f"{number}.json"), "w", encoding="utf-8") as newfile:
        newfile.write(json.dumps(d, indent=4))
        