In [None]:
from datetime import datetime, timezone
import requests
import pandas
import json
import time
import re

In [None]:
# Disable pandas' chained-assignment check for the whole kernel session
# (no SettingWithCopyWarning is emitted after this line runs).
# NOTE(review): this also hides genuine "assignment to a copy" mistakes in every
# later cell; prefer an explicit `.copy()` at the offending statement.
pandas.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Every input file is read as newline-delimited JSON: one JSON document per line.
with open('./hits_unique.json') as hits_file:
    unique_hits = [json.loads(raw_line) for raw_line in hits_file]

with open('./hits_unique_historic.json') as hits_file:
    unique_historic_hits = [json.loads(raw_line) for raw_line in hits_file]

with open('./hits.json') as hits_file:
    non_unique_hits = [json.loads(raw_line) for raw_line in hits_file]



In [None]:
## HELPER METHODS

def get_number_of_dblp_records(orcid: str, timeout: float = 30.0):
    """Return the record count found on the dblp page of an ORCID.

    Requests ``/orcid/<orcid>.xml?view=ajax`` and reads the first
    ``n="<digits>"`` attribute of the response body.

    Args:
        orcid: ORCID identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The parsed count as ``int``; ``0`` when the server answers 404.

    Raises:
        Exception: on any other non-OK status, or when the response carries
            no ``n="<digits>"`` attribute.
    """
    r = requests.get(f'http://dblp2.uni-trier.de/orcid/{orcid}.xml?view=ajax', timeout=timeout)
    if r.status_code == 404:
        return 0
    if not r.ok:
        # Report the ORCID that failed (the builtin `str` was interpolated here before).
        raise Exception(f"{orcid} got an {r.status_code}")
    # `\d+` (not `\d*`) so that an empty attribute can never reach int('').
    z = re.search(r'n="(\d+)"', r.text)
    if z is None:
        raise Exception(f'{orcid}: no n="<count>" attribute in the dblp response')
    records_in_dblp = int(z.group(1))
    # Short pause after a successful lookup to throttle consecutive requests.
    time.sleep(0.1)
    return records_in_dblp


def get_dblp_pid_by_orcid(orcid: str, timeout: float = 30.0):
    """Look up the dblp pid attached to an ORCID via the search endpoint.

    Takes the first hit of the JSON search result and extracts the ``pid``
    attribute of the ``<author>``/``<editor>`` element whose ``orcid``
    attribute equals ``orcid``.

    Args:
        orcid: ORCID identifier to search for.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The pid as ``str``, or ``None`` when the server answers 404 or the
        search result contains no hit.

    Raises:
        Exception: on any other non-OK status, or when the first hit holds no
            author/editor element carrying this ORCID.
    """
    r = requests.get(f'http://dblp2.uni-trier.de:8000/?q=:facetid:eid:"ORCID:{orcid}"&format=json&p=1', timeout=timeout)
    if r.status_code == 404:
        return None
    if not r.ok:
        # Report the ORCID that failed (the builtin `str` was interpolated here before).
        raise Exception(f"{orcid} got an {r.status_code}")
    # A result without any hit has no usable 'hit' list; treat it as "not found".
    hits = r.json()['result']['hits'].get('hit') or []
    if not hits:
        return None
    record = hits[0]['info']['record']
    # Escape the ORCID so it is always matched literally inside the pattern.
    z = re.search(fr'<(?:author|editor) pid="([^"]*)" orcid="{re.escape(orcid)}">', record)
    if z is None:
        raise Exception(f"{orcid}: no author/editor pid found in record: {record}")
    # Return the bare pid: callers test the result against None and pass it on
    # as a string, which a `(pid, None)` tuple would break.
    return z.group(1)


def get_dblp_record_count_by_pid(pid: str, timeout: float = 30.0):
    """Return the record count found on the dblp page of a pid.

    Requests ``/pid/<pid>.xml?view=ajax`` and reads the first
    ``n="<digits>"`` attribute of the response body.

    Args:
        pid: dblp person id interpolated into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The parsed count as ``int``, or ``None`` when the server answers 404.

    Raises:
        Exception: on any other non-OK status, or when the response carries
            no ``n="<digits>"`` attribute.
    """
    r = requests.get(f"https://dblp2.uni-trier.de/pid/{pid}.xml?view=ajax", timeout=timeout)
    if r.status_code == 404:
        return None
    if not r.ok:
        # Report the pid that failed (the builtin `str` was interpolated here before).
        raise Exception(f"{pid} got an {r.status_code}")
    # `\d+` (not `\d*`) so that an empty attribute can never reach int('').
    z = re.search(r'n="(\d+)"', r.text)
    if z is None:
        # Keep the response body visible for debugging, as before.
        print(r.text)
        raise Exception(f'{pid}: no n="<count>" attribute in the dblp response')
    records_in_dblp = int(z.group(1))
    return records_in_dblp


def get_number_of_dblp_records_extended(orcid: str):
    """Return the dblp record count for an ORCID, resolved through its pid.

    First resolves the ORCID to a pid with ``get_dblp_pid_by_orcid``, then
    asks ``get_dblp_record_count_by_pid`` for the count of that pid.

    Args:
        orcid: ORCID identifier to resolve.

    Returns:
        The record count as ``int``; ``0`` when either lookup returns ``None``.
    """
    pid = get_dblp_pid_by_orcid(orcid)
    # Accept a `(pid, extra)` tuple as well as a bare pid, so that only the pid
    # string is ever interpolated into the follow-up request URL.
    if isinstance(pid, tuple):
        pid = pid[0]
    if pid is None:
        return 0

    records_in_dblp = get_dblp_record_count_by_pid(pid)
    if records_in_dblp is None:
        return 0

    # Short pause after a successful lookup to throttle consecutive requests.
    time.sleep(0.1)
    return records_in_dblp


def get_versions_conceptrecid(recid):
    """Fetch the Zenodo records that share a concept record id.

    Queries ``/api/records/`` with ``conceptrecid:<recid>``, ``all_versions``
    enabled and a page size of 1000, and flattens the returned hits.

    When the response reports ``X-RateLimit-Remaining: 0`` the call blocks
    until the time given in ``X-RateLimit-Reset`` before returning, so that a
    following call does not hit the limit.

    Args:
        recid: Concept record id to query for.

    Returns:
        A ``pandas.DataFrame`` built with ``json_normalize`` from the hits.

    Raises:
        Exception: when the response status is not OK.
    """
    BASE_URL = "https://zenodo.org"

    params = {
        'q': f'conceptrecid:{recid}',
        'all_versions': 'true',
        'exact': 'true',
        'size': 1000
    }

    r = requests.get(BASE_URL + "/api/records/", params=params)

    # Read the rate-limit headers defensively: a response without them must not
    # fail with a KeyError.
    remaining = r.headers.get('X-RateLimit-Remaining')
    reset_epoch = r.headers.get('X-RateLimit-Reset')

    if remaining is not None and reset_epoch is not None and int(remaining) == 0:
        reset_time_utc = datetime.fromtimestamp(int(reset_epoch), timezone.utc)
        current_time = datetime.now(timezone.utc)
        # Clamp at zero: the reset time may already be in the past, and
        # time.sleep() rejects negative durations with a ValueError.
        time_to_wait_in_seconds = max(0.0, (reset_time_utc - current_time).total_seconds())
        print(f"Waiting for {time_to_wait_in_seconds} seconds")
        time.sleep(time_to_wait_in_seconds)

    if not r.ok:
        # Fail with the status code instead of a KeyError on the missing 'hits' key.
        raise Exception(f"conceptrecid {recid} got an {r.status_code}")

    df = r.json()
    concept_records = pandas.json_normalize(df['hits']['hits'])

    return concept_records

In [None]:
## PREPARE DATASETS
# Flatten the parsed JSON documents into DataFrames; nested keys become
# dot-separated column names (e.g. 'metadata.creators').
unique_records = pandas.json_normalize(unique_hits)
unique_historic_records = pandas.json_normalize(unique_historic_hits)
nonunique_records = pandas.json_normalize(non_unique_hits)


# Fixed-width text file; later cells read its `orcid` column.
dblp_orcids = pandas.read_fwf('./unique_orcids.txt')
# Cached results of earlier runs. Later cells read the columns `orcid` and
# `number_of_rdblp_recods` of the first frame, and `created` / `conceptrecid`
# of the second.
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# load pickle files that were produced locally by this notebook.
unique_orcid = pandas.read_pickle('./unique_orcids_with_dblp_record_counts.pickle')
versions_of_concepts = pandas.read_pickle('./concept_versions.pickle')

In [None]:
## PREPARE METRICS

# Q: How many distinct concept DOIs are there?
count_unique_concept_doi = unique_records['conceptdoi'].nunique()

## Q: How many creators does each dataset list? (min, max, median, mean, 95% quantile)
unique_records['#creators'] = unique_records['metadata.creators'].apply(len)
creators_per_record = unique_records['#creators']

creators_count_min = creators_per_record.min()
creators_count_max = creators_per_record.max()
creators_count_mean = creators_per_record.mean()
creators_count_median = creators_per_record.median()
creators_count_quantile95 = creators_per_record.quantile(q=0.95)

## Versions
def _version_count(version_relations):
    """Return the 'count' entry of the first element of a record's version relations."""
    return version_relations[0]['count']


unique_records['#versions'] = unique_records['metadata.relations.version'].apply(_version_count)
versions_per_record = unique_records['#versions']

version_count_min = versions_per_record.min()
version_count_max = versions_per_record.max()
version_count_mean = versions_per_record.mean()
version_count_median = versions_per_record.median()
version_count_quantile95 = versions_per_record.quantile(q=0.95)
# Number of records whose version count is exactly one.
version_count_exactly_one_version = int((versions_per_record == 1).sum())


## Historic Versions
unique_historic_records['#versions'] = unique_historic_records['metadata.relations.version'].apply(_version_count)
historic_versions_per_record = unique_historic_records['#versions']

version_historic_count_min = historic_versions_per_record.min()
version_historic_count_max = historic_versions_per_record.max()
version_historic_count_mean = historic_versions_per_record.mean()
version_historic_count_median = historic_versions_per_record.median()
version_historic_count_quantile95 = historic_versions_per_record.quantile(q=0.95)

# Question: do the authors change between versions?
# Number of rows per concept DOI. The series is renamed so that the merge below
# adds it as its own column instead of colliding with the records' `doi`
# column (which would be split into `doi_x` / `doi_y`).
multiple_versions = nonunique_records.groupby('conceptdoi')['doi'].size().rename('#records')
# Keep only concepts that occur more than once.
multiple_versions = multiple_versions[multiple_versions > 1]
records_with_multiple_versions = pandas.merge(nonunique_records, multiple_versions, how='inner', on='conceptdoi')
records_with_multiple_versions['#creators'] = records_with_multiple_versions['metadata.creators'].apply(len)
# Per concept DOI: how many different creator counts occur across its rows.
changing_creator_count = records_with_multiple_versions[['conceptdoi', '#creators']].groupby('conceptdoi').nunique()

multiple_versions_concept_doi_count = records_with_multiple_versions['conceptdoi'].nunique()
# Count rows (concepts) with len(); DataFrame.size is rows * columns and would
# over-count whenever the grouped frame has more than one column.
multiple_versions_changing_authors_count = len(changing_creator_count[changing_creator_count['#creators'] > 1])

## Communities
# One row per (record, community) pair; records without communities yield NaN.
communmities = unique_records[['metadata.communities']].explode('metadata.communities')
# Take an explicit copy of the filtered rows so that the column assignment below
# writes to an independent frame rather than to a slice of `communmities`.
communmities_clean = communmities[communmities['metadata.communities'].notna()].copy()
# Replace each community entry by its 'id' value.
communmities_clean['metadata.communities'] = communmities_clean['metadata.communities'].apply(lambda x: x['id'])
# Number of rows per community id, largest first.
communmities_agg = communmities_clean.groupby('metadata.communities').agg(count=('metadata.communities', 'count')).sort_values(by=['count'], ascending=False)

# (community id, count) tuples of the 20 largest communities.
top_20_communities = list(communmities_agg.head(20).itertuples(index=True, name=None))
communities_quantile = communmities_agg['count'].quantile(q=0.95)
communities_mean = communmities_agg['count'].mean()
communities_median = communmities_agg['count'].median()
communities_max = communmities_agg['count'].max()
communities_min = communmities_agg['count'].min()


# DBLP authors
def _creator_signatures(records):
    """Explode `records` into one row per (doi, creator) pair.

    Adds the creator's 'affiliation', 'name' and 'orcid' values as columns;
    a key that is absent from the creator dict yields None.
    """
    signatures = records[['doi', 'metadata.creators']].explode('metadata.creators')
    for key in ('affiliation', 'name', 'orcid'):
        signatures[key] = signatures['metadata.creators'].apply(lambda creator, key=key: creator.get(key, None))
    return signatures


# ORCIDs whose cached dblp record count is greater than zero.
verified_dblp_orcids = unique_orcid[unique_orcid.number_of_rdblp_recods > 0]['orcid']

## DBLP authors: historic dataset
doi_signatures_historic = _creator_signatures(unique_historic_records)

# Number of distinct ORCIDs per DOI.
records_historic_with_orcid_count = doi_signatures_historic.groupby('doi').agg({"orcid": pandas.Series.nunique}).sort_values(by='orcid')
# Signatures whose ORCID appears in the dblp ORCID list.
records_historic_dblp_authors = doi_signatures_historic[doi_signatures_historic.orcid.isin(dblp_orcids.orcid)]

count_signatures_hisotric = len(doi_signatures_historic)
count_signatures_with_orcid_hisotric = len(doi_signatures_historic.dropna(subset=['orcid']))
count_records_with_at_least_one_orcid_hisotric = len(records_historic_with_orcid_count[records_historic_with_orcid_count.orcid > 0])
count_records_with_a_dblp_match_hisotric = records_historic_dblp_authors['doi'].nunique()
count_records_with_a_verified_dblp_match_hisotric = doi_signatures_historic[doi_signatures_historic.orcid.isin(verified_dblp_orcids)]['doi'].nunique()

## DBLP authors: current dataset
doi_signatures = _creator_signatures(unique_records)

# Number of distinct ORCIDs per DOI.
records_with_orcid_count = doi_signatures.groupby('doi').agg({"orcid": pandas.Series.nunique}).sort_values(by='orcid')
# Signatures whose ORCID appears in the dblp ORCID list.
records_dblp_authors = doi_signatures[doi_signatures.orcid.isin(dblp_orcids.orcid)]

count_signatures = len(doi_signatures)
count_signatures_with_orcid = len(doi_signatures.dropna(subset=['orcid']))
count_records_with_at_least_one_orcid = len(records_with_orcid_count[records_with_orcid_count.orcid > 0])
count_records_with_a_dblp_match = records_dblp_authors['doi'].nunique()
count_records_with_a_verified_dblp_match = doi_signatures[doi_signatures.orcid.isin(verified_dblp_orcids)]['doi'].nunique()



# Disabled recipe for rebuilding the cached frame loaded below (it calls the
# dblp lookup helpers once per ORCID); kept for reference:
# unique_orcid = records_dblp_authors.groupby('orcid').size().reset_index(level=0)
# unique_orcid['number_of_rdblp_recods'] = unique_orcid['orcid'].apply(lambda x: get_number_of_dblp_records(x))
# unique_orcid['number_of_rdblp_recods_extended'] = unique_orcid['orcid'].apply(lambda x: get_number_of_dblp_records_extended(x))

unique_orcid = pandas.read_pickle('./unique_orcids_with_dblp_record_counts.pickle')

# ORCIDs with at least one dblp record according to the cached counts.
has_dblp_records = unique_orcid['number_of_rdblp_recods'] > 0
count_verified_dblp_orcid = int(has_dblp_records.sum())


## Version Updates

# Disabled recipe for rebuilding './concept_versions.pickle' (one Zenodo request
# per concept with more than one version); kept for reference:
# versions_of_concepts = pandas.DataFrame()
# unique_several_versions = unique_historic_records[unique_historic_records['#versions'] > 1]
# for recid in unique_several_versions['conceptrecid']:
#     df_one = get_versions_conceptrecid(recid)
#     versions_of_concepts = pandas.concat([versions_of_concepts, df_one], ignore_index=True)
# versions_of_concepts.to_pickle('./concept_versions.pickle')

# Parse the creation timestamps and order all versions chronologically.
versions_of_concepts = (
    versions_of_concepts
    .assign(created=pandas.to_datetime(versions_of_concepts['created']))
    .sort_values(by=['created'], ascending=True)
)
# Creation time of the preceding version within the same concept (NaT for the first one).
created_by_concept = versions_of_concepts.groupby('conceptrecid')['created']
versions_of_concepts['prev'] = created_by_concept.shift()

# Gap to the preceding version, expressed in days.
one_day = pandas.Timedelta(days=1)
versions_of_concepts['time_delta'] = (versions_of_concepts['created'] - versions_of_concepts['prev']) / one_day
concepts_update_time = versions_of_concepts.groupby('conceptrecid').agg(
    avg_time_delta=('time_delta', 'mean'),
    count=('time_delta', 'count'),
)


avg_update_interval = concepts_update_time['avg_time_delta']
concept_min_update_time = float(avg_update_interval.min())
concept_max_update_time = float(avg_update_interval.max())
concept_mean_update_time = float(avg_update_interval.mean())
concept_median_update_time = float(avg_update_interval.median())
concept_quantile = float(avg_update_interval.quantile(0.2))

## Export of verified datasets

# ORCIDs whose cached dblp record count is greater than zero.
verified_orcids = unique_orcid.loc[unique_orcid['number_of_rdblp_recods'] > 0, 'orcid']
# DOIs that have at least one creator signature carrying such an ORCID.
has_verified_creator = doi_signatures['orcid'].isin(verified_orcids)
unique_doi_verified = doi_signatures.loc[has_verified_creator, 'doi'].unique()
verified_datasets = unique_records[unique_records['doi'].isin(unique_doi_verified)]

export_columns = [
    'conceptdoi',
    'doi',
    'metadata.title',
    'stats.views',
    '#versions',
    'links.html',
    'links.doi',
    'created',
    'updated',
]
# Most viewed datasets first.
output_verified = verified_datasets[export_columns].sort_values(by=['stats.views'], ascending=False)
output_verified.to_csv('./output_datasets_from_verified_dblp_authors.csv', header=True, index=False)



In [None]:
# Plain-text report of the metrics computed earlier in the notebook.
# NOTE(review): the "10.000" figure in the first line is hard-coded in the
# message, not derived from the loaded data; confirm it matches the input size.
print(f"In the past  10.000 unique hits there where")
print(f"\t * {count_unique_concept_doi} unique concept dois")
print()
# Creators per record.
print("Number of Creators")
print(f"\t * min(#creators) = {creators_count_min}")
print(f"\t * max(#creators) = {creators_count_max}")
print(f"\t * median(#creators) = {creators_count_median}")
print(f"\t * mean(#creators) = {creators_count_mean}")
print(f"\t * quantile(0.95, #creators) = {creators_count_quantile95}")
# Versions per record.
print("Number of Versions")
print(f"\t * min(#versions) = {version_count_min}")
print(f"\t * max(#versions) = {version_count_max}")
print(f"\t * median(#versions) = {version_count_median}")
print(f"\t * mean(#versions) = {version_count_mean}")
print(f"\t * quantile(0.95, #versions) = {version_count_quantile95}")
print(f"\t * count(#versions == 1) = {version_count_exactly_one_version}")
# Historic dataset: the header shows (earliest 'created', latest 'created', number of records).
print(f"Number of historic Versions {(unique_historic_records['created'].min(), unique_historic_records['created'].max(), len(unique_historic_records))}")
print(f"\t * min(#versions) = {version_historic_count_min}")
print(f"\t * max(#versions) = {version_historic_count_max}")
print(f"\t * median(#versions) = {version_historic_count_median}")
print(f"\t * mean(#versions) = {version_historic_count_mean}")
print(f"\t * quantile(0.95, #versions) = {version_historic_count_quantile95}")
# Community sizes.
print("Communities")
print(f"\t * min(size(communities)) = {communities_min}")
print(f"\t * max(size(communities)) = {communities_max}")
print(f"\t * median(size(communities)) = {communities_median}")
print(f"\t * mean(size(communities)) = {communities_mean}")
print(f"\t * quantile(0.95, size(communities)) = {communities_quantile}")
print(f"\t * Top 20 communities: {top_20_communities}")
print()
# Creator-count changes between versions of the same concept.
print("Changing Versions")
print(f"\t * Of the last {len(non_unique_hits)} non  unique updated datasets, there are")
print(f"\t * {multiple_versions_concept_doi_count} datasets that have been updated at least once")
print(f"\t * of them {multiple_versions_changing_authors_count} changed their author count")
print()
# ORCID / dblp matching on the current dataset.
print("DBPL authors")
print(f"\t * Number of signatures in dataset = {count_signatures}")
print(f"\t * Number of signatures with orcid = {count_signatures_with_orcid}")
print(f"\t * Number of records with at least one orcid = {count_records_with_at_least_one_orcid}")
print(f"\t * Number of records with a least one dblp author = {count_records_with_a_dblp_match}")
print(f"\t * Number of records with a least one verified dblp author = {count_records_with_a_verified_dblp_match}")
print(f"\t * Number of verified dblp orcids = {count_verified_dblp_orcid}")
# ORCID / dblp matching on the historic dataset.
print("DBPL authors historic dataset")
print(f"\t * Number of signatures in dataset = {count_signatures_hisotric}")
print(f"\t * Number of signatures with orcid = {count_signatures_with_orcid_hisotric}")
print(f"\t * Number of records with at least one orcid = {count_records_with_at_least_one_orcid_hisotric}")
print(f"\t * Number of records with a least one dblp author = {count_records_with_a_dblp_match_hisotric}")
print(f"\t * Number of records with a least one verified dblp author = {count_records_with_a_verified_dblp_match_hisotric}")
# Average time between consecutive versions, aggregated per concept.
print("Updates to concepts")
print(f"\t * MIN avg_time_between_updates_by_concept in days: {concept_min_update_time}")
print(f"\t * MAX avg_time_between_updates_by_concept in days: {concept_max_update_time}")
print(f"\t * MEAN avg_time_between_updates_by_concept in days: {concept_mean_update_time}")
print(f"\t * MEDIAN avg_time_between_updates_by_concept in days: {concept_median_update_time}")
print(f"\t * QUANTILE 20% avg_time_between_updates_by_concept in days: {concept_quantile}")