# Gathering Bibliometric Information
To analyse and correlate the reproducibility, the paper citations, and the scientific background of the authors, bibliometric information is needed. This information is mostly gathered via the `Scopus API`.

The following data was collected via the API based on the DOI of the papers:
- citation-count
- authors
    - author-url
    - given-name
    - surname
    - document-count
    - cited-by-count
    - citation-count
    - h-index
    - current-affiliation
    - first-affiliation
    - subject-area

This was done with the code provided below. However, three DOIs could not be found via the API. The information for these papers was retrieved manually (see the corresponding cell below for details).

Furthermore, the type of affiliation of each author was classified manually by two researchers (see the `affil-type` variable). The classes were "University", "Industry" and "non-academic research".

## Setup

In [1]:
# Imports
import json
from tqdm import tqdm
import requests
import pandas as pd
import os
import yaml
from dotenv import load_dotenv
import os


In [2]:
# Parameters
save_data = False # Saves data to disk (the json.dump calls below are gated by this flag)
use_local_data = True # Uses local data instead of the API fetch
local_data_path = '../data/local/' # Path to local data
# Make sure the local data directory exists before anything is read from or written to it
os.makedirs(local_data_path, exist_ok=True)

# Scopus API Calling function and parsers



In [3]:
def get_publication_data(doi, headers, collect_author_data=True):
    """Fetch the citation count and author information of one DOI from Scopus.

    Queries the Scopus Search API for ``doi`` and uses the first returned
    entry that contains an ``author`` field.

    Args:
        doi: DOI string inserted into the ``DOI(...)`` search query.
        headers: HTTP headers passed to ``requests.get``.
        collect_author_data: If True, every author URL is resolved with
            ``get_author_data``; otherwise only the author URLs are returned.

    Returns:
        A dict with the keys ``doi``, ``citation-count`` and ``authors``, or
        ``None`` if processing the entry failed (the error is printed).

    Raises:
        IndexError: If the search returned no entry with author information.
        Exceptions raised by the HTTP request or JSON decoding propagate.
    """
    # HTTPS (as used by the other API calls in this file) so that the request
    # headers are not transmitted in plain text.
    api_call = f'https://api.elsevier.com/content/search/scopus?query=DOI({doi})&field=citedby-count,author-url'
    entries = (requests
               .get(api_call, headers=headers, timeout=30)
               .json()
               .get("search-results", {})
               .get('entry', [{}]))

    entries_with_authors = [entry for entry in entries if 'author' in entry]
    if not entries_with_authors:
        # Same exception type as the former bare ``[...][0]``, but with a
        # message that tells the caller which DOI was not found.
        raise IndexError(f'No Scopus entry with author information found for DOI: {doi}')
    response = entries_with_authors[0]

    publication_data = None
    try:
        authors = response.get("author", [{}])
        author_urls = [author.get("author-url", {}) for author in authors]
        citation_count = response.get("citedby-count", 'Unknown')
        if collect_author_data:
            authors_data = [get_author_data(author_url, headers) for author_url in author_urls]
        publication_data = {
            "doi": doi,
            "citation-count": citation_count,
            "authors": authors_data if collect_author_data else author_urls,
        }
    except Exception as e:
        # Best effort: report the problem and return None for this DOI.
        print(f'Doi: {doi}')
        print(e)
    return publication_data

In [4]:
def get_author_data(author_url, headers):
    """Fetch profile metrics, current affiliation and subject areas of an author.

    Two requests are made: one to ``author_url`` (ENHANCED view) for the
    profile fields, and one to the Scopus author search (by the author id
    taken from the last path segment of ``author_url``) whose result is
    passed to ``parse_subjects``.

    Args:
        author_url: Scopus author URL; its last ``/``-separated segment is
            used as the author id.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A dict with the keys ``author-url``, ``given-name``, ``surname``,
        ``document-count``, ``cited-by-count``, ``citation-count``,
        ``h-index``, ``current-affiliation`` and ``subject-area``. Fields
        missing from the response are set to ``'Unknown'``. ``None`` is
        returned if building the dict failed (details are printed).

    Raises:
        Exceptions raised by the HTTP requests or while decoding/unwrapping
        the responses propagate to the caller.
    """
    profile_fields = ('affiliation-current,affiliation-history,given-name,surname,'
                      'cited-by-count,citation-count,document-count,subject-areas,h-index')
    response = (requests
                .get(f'{author_url}?view=ENHANCED&field={profile_fields}', headers=headers, timeout=30)
                .json()
                .get("author-retrieval-response", [{}])
                [0]
                )
    author_id = author_url.split('/')[-1]
    response_author = (requests
                .get(f'https://api.elsevier.com/content/search/author?query=au-id({author_id})', headers=headers, timeout=30)
                .json()
                )

    # NOTE: the "first-affiliation" field is currently not collected; the
    # lookup via get_author_first_affiliation(author_id, headers) is disabled.
    affiliation_keys = ['affiliation-name', 'affiliation-city', 'affiliation-country']

    author_data = None
    try:
        affiliation = response.get('affiliation-current', {})
        preferred_name = response.get('preferred-name', {})
        coredata = response.get('coredata', {})
        author_data = {
            "author-url": author_url,
            "given-name": preferred_name.get('given-name', 'Unknown'),
            "surname": preferred_name.get('surname', 'Unknown'),
            "document-count": coredata.get('document-count', 'Unknown'),
            "cited-by-count": coredata.get('cited-by-count', 'Unknown'),
            "citation-count": coredata.get('citation-count', 'Unknown'),
            "h-index": response.get('h-index', 'Unknown'),
            "current-affiliation": ', '.join([str(affiliation.get(key, 'Unknown')) for key in affiliation_keys]),
            "subject-area": parse_subjects(response_author),
        }
    except Exception as e:
        # Best effort: print everything needed to debug this author and
        # return None.
        print(f'Author url: {author_url}')
        print('Response:', response)
        print('Response all:', response_author)
        print(e)
    return author_data

In [5]:
def get_author_first_affiliation(author_id, headers):
    """Read the affiliations attached to the first search hit of an author.

    Runs a Scopus search for ``AU-ID(author_id)`` with ``sort=pubyear`` and
    formats the affiliations of the first returned entry as
    ``"<name>, <city>, <country>"`` strings.
    NOTE(review): this presumably relies on ``sort=pubyear`` returning the
    earliest publication first - confirm against the API documentation.

    Args:
        author_id: Scopus author id used in the search query.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A tuple ``(first_affil, affils)``: the formatted first affiliation of
        the first entry and the list of all its formatted affiliations. If an
        error occurs while parsing, it is printed and the values collected so
        far are returned (initially ``None`` and ``[]``).

    Raises:
        Exceptions raised by the HTTP request or JSON decoding propagate.
    """
    response = requests.get(
        f'https://api.elsevier.com/content/search/scopus?query=AU-ID({author_id})&sort=pubyear',
        headers=headers,
        timeout=30,
    ).json()
    first_affil = None
    affils = []
    try:
        affiliation_keys = ['affilname', 'affiliation-city', 'affiliation-country']

        def format_affiliation(affiliation):
            # Missing parts are rendered as 'Unknown'.
            return ', '.join([str(affiliation.get(key, 'Unknown')) for key in affiliation_keys])

        # Look the affiliation list up once instead of walking the response twice.
        affiliations = response.get('search-results', {}).get('entry', [{}])[0].get('affiliation', [{}])
        first_affil = format_affiliation(affiliations[0])
        for affiliation in affiliations:
            affils.append(format_affiliation(affiliation))

    except Exception as e:
        print(f'Author id: {author_id}')
        print(e)
    return first_affil, affils

In [6]:
def parse_subjects(response):
    """Extract the subject areas of the first entry of an author search response.

    Args:
        response: Decoded JSON (dict) of an author search. The subject areas
            are read from ``response["search-results"]["entry"][0]["subject-area"]``.

    Returns:
        A dict mapping each subject name (the ``'$'`` value) to its
        ``'@frequency'`` value, with ``'Unknown'`` used for a missing
        frequency. If the response has no search results, no entries, or a
        subject area that is neither a list nor a dict, a message is printed
        and ``{'Unknown': 'Unknown'}`` is returned.
    """
    # ``or`` fallbacks cover both missing keys and empty/None values, so a
    # response without results no longer raises AttributeError/IndexError.
    search_results = response.get("search-results") or {}
    entries = search_results.get("entry") or [{}]
    subjects = entries[0].get("subject-area", 'Unknown')

    subjects_dict = {}
    if isinstance(subjects, list):
        for subject in subjects:
            subjects_dict[subject.get('$')] = subject.get('@frequency', 'Unknown')
    elif isinstance(subjects, dict):
        # A single subject area is returned as a bare dict instead of a list.
        subjects_dict[subjects.get('$')] = subjects.get('@frequency', 'Unknown')
    else:
        subjects_dict['Unknown'] = 'Unknown'
        print('Unknown subjects format:', subjects, type(subjects))
    return subjects_dict


# Gathering Information

To access the Scopus API, an API key must be available. Caution: you also need access to Scopus itself (a university VPN may be required).

Therefore, create a `.env` file that contains a line like `API_KEY=your_api_key_here`. The API key is then loaded from the `.env` file for security and flexibility.
This keeps the API key out of the source code and version control.

In [7]:
# Load API key from dotenv file
load_dotenv()

api_key = os.getenv("API_KEY")
if not api_key and not use_local_data:
    # Without a key the header below would carry the literal string "None",
    # so point out the missing configuration before any request is made.
    print('Warning: API_KEY is not set; check your .env file before fetching from the Scopus API.')

# Headers sent with every Scopus API request: the API key and a JSON response format.
headers = {
    "X-ELS-APIKey": f'{api_key}',
    "Accept": "application/json"
}

## Retrieve publication and author data
Retrieve information for all DOIs listed in the `papers_reviewed_reprod_variables_categoric.csv` file.

In addition, store all DOIs that could not be found in a list for later manual retrieval.

In [8]:
# Load DOIs
DOIs = pd.read_csv('../data/papers_reviewed_reprod_variables_categoric.csv', usecols=['DOI_short'])
publication_data = []
notFoundDOIs = []

if not use_local_data:

    # Query the Scopus API once per DOI; failures are collected in
    # notFoundDOIs instead of aborting the whole run.
    for doi in tqdm(DOIs.values, desc='Fetching Scopus API data'):
        try:
            result = get_publication_data(doi[0], headers, collect_author_data=True)
        except Exception as e:
            print(f'Error fetching data for DOI: {doi[0]}')
            print(e)
            notFoundDOIs.append(doi[0])
            continue

        if result is None:
            # get_publication_data printed the error and gave up on this DOI.
            # Do not store the None: later cells index paper['authors'] on
            # every stored entry.
            notFoundDOIs.append(doi[0])
        else:
            publication_data.append(result)

    print(f'Number of DOIs not found: {len(notFoundDOIs)}')
    print(f'Not found DOIs: {notFoundDOIs}')

    if save_data:
        try:
            with open(f'{local_data_path}publication_data.json', 'w') as json_file:
                json.dump(publication_data, json_file, indent=4)
        except Exception as e:
            # Saving is best effort; the fetched data stays available in memory.
            print(f'Error saving data: {e}')

else:
    # Reuse the previously saved API results instead of calling the API.
    with open(f'{local_data_path}publication_data.json', 'r') as json_file:
        publication_data = json.load(json_file)


## Retrieve not (automatically) found DOIs manually

Three papers could not be found via the Scopus API search. Therefore the information of these publications needed to be retrieved manually and added to the `missing_information` variable below. 

The latter two publications (`https://dl.acm.org/doi/10.5555/2873021.2873029` and `10.26868/25222708.2019.210311`) could be found via manual search on the Scopus webpage, from which the citation counts and author URLs were collected. The remaining publication (`https://dl.acm.org/doi/10.5555/3615924.3623632`) could only be found on `https://dl.acm.org/`. Regarding the citations of this publication, different values were found: `dl.acm.org` listed 0 citations, whereas Google Scholar listed 1 citation (see `https://scholar.google.com/scholar?cites=11212822057543812296&as_sdt=2005&sciodt=0,5&hl=de`). Consequently, the Google Scholar value of 1 was used. The authors were searched manually in the Scopus database (web frontend) and added with their author URLs.

In [9]:
# Manually collected records for the papers that the API search did not return.
# Each record mirrors the structure produced by get_publication_data:
# "doi", "citation-count" and a list of "authors" identified by "author-url".
missing_information = [
    {
        # No "scopus-publication-id" here: this entry has none recorded.
        "doi": "https://dl.acm.org/doi/10.5555/3615924.3623632",
        "citation-count": "1",
        "authors": [
            {"author-url": "https://api.elsevier.com/content/author/author_id/58172106500"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/57205485739"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/6602631556"}
        ]
    },
    {
        "doi": "https://dl.acm.org/doi/10.5555/2873021.2873029",
        "scopus-publication-id": "84937434978",
        "citation-count": "9",
        "authors": [
            {"author-url": "https://api.elsevier.com/content/author/author_id/56727125200"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/56727971700"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/57212926253"},
            # NOTE(review): author id 36701070400 is listed twice in a row -
            # possibly a copy/paste slip for a different author; verify
            # against the Scopus record.
            {"author-url": "https://api.elsevier.com/content/author/author_id/36701070400"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/36701070400"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/56728508600"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/57217189030"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/56728761200"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/54079611600"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/36702009900"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/7404910952"}
        ]
    },
    {
        "doi": "10.26868/25222708.2019.210311",
        "scopus-publication-id": "85103634671",
        "citation-count": "7",
        "authors": [
            {"author-url": "https://api.elsevier.com/content/author/author_id/57207960636"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/15831158200"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/55673143000"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/57207965928"},
            {"author-url": "https://api.elsevier.com/content/author/author_id/57207965458"}
        ]
    }
]


In [None]:
# Either reload the previously enriched records from disk, or complete the
# manually collected author URLs with the author details from the API.
if use_local_data:
    with open(f'{local_data_path}missing_information.json', 'r') as json_file:
        missing_information = json.load(json_file)
else:
    for paper in tqdm(missing_information):
        for author in paper['authors']:
            # Entries without an author URL cannot be looked up.
            if 'author-url' not in author:
                continue
            author_url = author['author-url']
            author_data = get_author_data(author_url, headers)
            # None signals a failed lookup; keep the bare entry in that case.
            if author_data is not None:
                author.update(author_data)

    if save_data:
        with open(f'{local_data_path}missing_information.json', 'w') as json_file:
            json.dump(missing_information, json_file, indent=4)

## Combining both publication data files

In [11]:
# Append the manually collected records to the records fetched via the API.
publication_data_combined = publication_data + missing_information


# Adding the affiliation type manually

In addition to the information retrieved above, the type of affiliation should be investigated. Therefore, two researchers have manually classified all authors into either `University`, `Industry` or `non-academic research`. The labelling data can be found in the data folder as `affil_types.csv`.

The file combining the publication data with the affiliation types is stored as `bibliometric_data.json`.

In [12]:
# Load the manual affiliation labels; the result is indexed by author-url below.
with open('../data/affil_types.json', 'r') as json_file:
    affil_types = json.load(json_file)


In [13]:
# Attach the manually assigned affiliation type to every author record.
for paper in publication_data_combined:
    for author in paper['authors']:
        # Direct indexing: the lookup fails if an author-url has no label in affil_types.
        affil_type = affil_types[author['author-url']]
        author['affil-type'] = affil_type

In [14]:
# Only written when save_data is True, to avoid overwriting the file unintentionally
if save_data:
    with open('../data/bibliometric_data.json', 'w') as json_file:
        json.dump(publication_data_combined, json_file, indent=4)