In [None]:
from datetime import date
import logging
import pandas
import time

import zenodo
import dblp

# Root logger: INFO and above, rendered as "<timestamp>: <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s: %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Render %(asctime)s via time.gmtime, i.e. in UTC rather than local time.
logging.Formatter.converter = time.gmtime


In [None]:
# Output files (relative to the notebook's working directory).
ZENODO_RECORDS_UNFILTERED_JSON = './zenodo-records.json'
ZENODO_RECORDS_UNFILTERED_PICKLE = './zenodo-records.pickle'
ZENODO_RECORDS_WITH_VERIFIED_DBLP_AUTHOR_JSON = './zenodo-records-verified-dblp-author.json'

# Date window handed to the Zenodo loader as its start/end arguments.
start_date = date(year=2022, month=8, day=1)
end_date = date(year=2022, month=11, day=1)

# Load and Store Records From Zenodo

In [None]:
# Fetch the dataset records for the configured date window from Zenodo.
hits = zenodo.load_dataset_records(
    start=start_date,
    end=end_date,
)

In [None]:
# Flatten the nested hits into a table; nested keys become dotted column
# names such as 'metadata.creators'.
zenodo_records = pandas.json_normalize(hits)

# Persist the unfiltered snapshot, once as pickle and once as JSON.
zenodo_records.to_pickle(ZENODO_RECORDS_UNFILTERED_PICKLE)
zenodo_records.to_json(ZENODO_RECORDS_UNFILTERED_JSON, orient='records')

# Continue with the stored snapshot. `orient` is a JSON reader/writer option;
# pandas.read_pickle does not accept it and raised a TypeError when it was passed.
zenodo_records = pandas.read_pickle(ZENODO_RECORDS_UNFILTERED_PICKLE)


# Transformation of Records
In the following, signatures (creator, record) are created from the loaded Zenodo records.
These signatures are compared by ORCID to verified dblp authors (dblp authors whose ORCID has been manually verified by the dblp team), based on an author snapshot from 24.11.2022.

In [None]:
# Prepare Zenodo Signatures: one row per (record DOI, creator) pair.
def _creator_field(creator, key):
    """Return ``creator[key]``, or None if the entry is not a dict or lacks the key.

    ``explode`` emits NaN for records whose creators list is empty or missing;
    calling ``.get`` on that float would raise AttributeError, so non-dict
    entries simply yield None for every field.
    """
    return creator.get(key, None) if isinstance(creator, dict) else None


# Exploding keeps the record's original index, so rows of the same record share it.
creators_by_doi = zenodo_records[['doi', 'metadata.creators']].explode('metadata.creators')

# Build the signature table in its final column order: doi, name, affiliation, orcid.
doi_signatures = creators_by_doi[['doi']].copy()
for creator_key in ('name', 'affiliation', 'orcid'):
    doi_signatures[creator_key] = creators_by_doi['metadata.creators'].apply(_creator_field, key=creator_key)

In [None]:
def _has_verified_dblp_orcid(signature):
    """Row-wise predicate: ask the dblp module whether this row's ORCID is a dblp ORCID."""
    return dblp.is_dblp_orcid(signature['orcid'])


# Only signatures that carry an ORCID can be matched against dblp.
orcids_zenodo = doi_signatures[['orcid']].dropna(subset=['orcid'])

# Keep the ORCIDs that dblp recognises, then the signatures that use them.
verified_mask = orcids_zenodo.apply(_has_verified_dblp_orcid, axis=1)
orcids_dblp_verifies = orcids_zenodo[verified_mask]
verified_orcids = orcids_dblp_verifies['orcid']
doi_signatures_dblp_verified = doi_signatures.loc[doi_signatures['orcid'].isin(verified_orcids)]

# Attach the full record to every verified signature (matched on DOI), then
# move the carried-over index into a regular column.
records_by_doi = zenodo_records.set_index('doi')
zenodo_record_dblp_creator = (
    doi_signatures_dblp_verified
    .join(records_by_doi, on='doi', how='inner')
    .reset_index()
)

# Export
We exported a dataset of Zenodo records with at least one creator who has an ORCID that is verified in dblp.
If a record has more than one verified dblp author, it appears once for each author in the following dataset.
See the following cells for metadata about:
* the number of unique records loaded, 
* the timeframe of the dataset,
* the number of unique records with at least one verified dblp author, 
* and the number of enriched, filtered, and exported zenodo records.

In [None]:
# Dataset metadata
loaded_dois = zenodo_records['doi']
updated_stamps = zenodo_records['updated']
exported_dois = zenodo_record_dblp_creator['doi']

summary_lines = (
    f"Export path: {ZENODO_RECORDS_WITH_VERIFIED_DBLP_AUTHOR_JSON}",
    f"Unique Zenodo Records loaded: {loaded_dois.nunique()}",
    f"Zenodo record timeframe: from {updated_stamps.min()} to {updated_stamps.max()}",
    f"Records in Zenodo Snapshot with at least one verified DBLP author: {exported_dois.nunique()}",
    f"Exported Records (one entry per verified author): {exported_dois.count()}",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Write the enriched records (one entry per verified author) as a JSON array.
zenodo_record_dblp_creator.to_json(
    path_or_buf=ZENODO_RECORDS_WITH_VERIFIED_DBLP_AUTHOR_JSON,
    orient='records',
)

In [None]:
# Read the export back in and display it as the cell's output.
pandas.read_json(
    path_or_buf=ZENODO_RECORDS_WITH_VERIFIED_DBLP_AUTHOR_JSON,
    orient='records',
)