https://novelpy.readthedocs.io/en/latest/indicators.html#wu-et-al-2019-bornmann-et-al-2019-bu-et-al-2019

---

"To increase the validity of the indicators included in this study, we considered only papers with at least 10 cited references and at least 10 citations. "

Bornmann, L., Devarakonda, S., Tekles, A., & Chacko, G. (2020). Are disruption index indicators convergently valid? The comparison of several indicator variants with assessments by peers. Quantitative Science Studies, 1(3), 1242–1259. https://doi.org/10.1162/qss_a_00068

---

Bu, Y., Waltman, L., & Huang, Y. (2021). A multidimensional framework for characterizing the citation impact of scientific publications. Quantitative Science Studies, 2(1), 155–183. https://doi.org/10.1162/qss_a_00109

di_nok_1 is highly correlated (> 0.8) with di_nok_5 and di_5.

---

Wu & Yan (2019) and Wu & Wu (2019) argue against including citations to the focal work's references ("prelude citations") and conclude that "four indicators (SC, SC-DC, SC/(SC+DC), and (SC-DC)/(SC+DC)) are logically and empirically reasonable." Here SC and DC denote solo citations and duet citations, respectively. We use (SC-DC)/(SC+DC), which this notebook stores as di_nok_1.

Wu, S., & Wu, Q. (2019). A confusing definition of disruption. SocArXiv. https://doi.org/10.31235/osf.io/d3wpk

Wu, Q., & Yan, Z. (2019). Solo citations, duet citations, and prelude citations: New measures of the disruption of academic papers (arXiv:1905.03461). arXiv. https://doi.org/10.48550/arXiv.1905.03461


## Setup and imports

In [1]:
%pip install pyarrow

distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8
user = False
home = None
root = None
prefix = None
distutils: /opt/conda/include/python3.8/UNKNOWN
sysconfig: /opt/conda/include/python3.8
user = False
home = None
root = None
prefix = None
Note: you may need to restart the kernel to use updated packages.


In [116]:
from collections import defaultdict
import math
import json
from pathlib import Path
import pickle
import time

from IPython.display import display, clear_output
import numpy as np
import pandas as pd
import requests

%matplotlib inline

# Every input and output path below is built from this directory,
# which is relative to the notebook's working directory.
DATA_DIR = Path('./data')

# input: JSON file that is loaded with json.load further down
RAW_COLLATED_PATH = DATA_DIR / 'raw_collated.json'
# output: Feather file holding the disruption DataFrame (also used as a checkpoint)
DISR_DF_PATH = DATA_DIR / 'disruption_df.feather'

# Records requested per API page; the author notes this is the API maximum.
# The same value is assigned again in the cell that defines retrieve_all_openalex_records.
PER_PAGE = 200 # API max
# Unused filter string, kept for reference.
# Open question: should citing works be limited to type:journal-article?
# SHARED_FILTERS = "type:journal-article,publication_year:>2010,publication_year:<2018" 

In [3]:

# CORE_DF_PATH = DATA_DIR / 'core_df.feather'

In [4]:
# core_df = pd.read_feather(CORE_DF_PATH)
# core_df.sort_values('cited_by_count', ascending=False).head(200)

In [104]:
PER_PAGE = 200


def retrieve_all_openalex_records(base_url, delay=1, max_pages=1000, timeout=30, session=None):
    """Fetch every page of an OpenAlex list query and return the combined results.

    Pagination is handled here: ``per_page`` and ``page`` are appended to
    ``base_url`` with ``&``, so ``base_url`` must already contain a query
    string and must not carry its own ``&page=`` parameter.

    Parameters:
        base_url: query URL without pagination parameters.
        delay: total seconds to pause per request; half is slept before the
            request and half after it.
        max_pages: upper bound on the number of pages requested.
        timeout: seconds to wait for each HTTP response, so that a stalled
            connection raises instead of hanging the whole crawl.
        session: optional object with a requests-style ``get(url, timeout=...)``
            method (e.g. a ``requests.Session``); ``requests.get`` is used
            when omitted.

    Returns:
        list of the items found under ``results`` on every page, in page order.

    Raises:
        ValueError: the response body is not valid JSON (re-raised after
            printing diagnostics).
        KeyError: the JSON has no ``meta`` entry (re-raised after printing
            diagnostics) or no ``results`` entry.

    NOTE(review): OpenAlex documents basic ``page`` paging as limited to the
    first 10,000 results; larger result sets would need cursor paging -- confirm.
    """
    assert "&page=" not in base_url, "base_url must not contain its own &page= parameter"

    http_get = requests.get if session is None else session.get
    half_delay = delay / 2

    records = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}&per_page={PER_PAGE}&page={page}"
        time.sleep(half_delay)
        got = http_get(url, timeout=timeout)

        try:
            # ValueError is the common base of the JSON decode errors that
            # .json() can raise, whichever JSON backend requests is using.
            j = got.json()
            meta = j['meta']
        except (ValueError, KeyError):
            print('oop made it angry')
            print(url)
            print('and we got')
            print(got)
            raise

        print(meta)
        if page == 1:
            print(url)

        records += j['results']

        time.sleep(half_delay)
        # Trust the server's own page/per_page values when deciding whether
        # the last page has been reached.
        if meta['per_page'] * meta['page'] >= meta['count']:
            break
    else:
        # The loop ran out of pages without reaching the end of the result set.
        print(f'warning: stopped after max_pages={max_pages}; results may be incomplete')
    return records

In [84]:
# N_YEARS_CITATIONS = 5
# f",publication_year:<{too_late}" ## so.... turns out cited url only returns 30. just... 30.
# Scratch check of range() bounds: the stop value is exclusive, so this prints 1 through 10.
print(list(range(1, 11)))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
with RAW_COLLATED_PATH.open('r', encoding='UTF-8') as infile:
    raw_records = json.load(infile)

# Only two things are needed from each raw record: the work's id and the ids
# of the works it references. Both are shortened by dropping the URL prefix.
openalex_prefix = 'https://openalex.org/'
focal_works = {
    r['id'].replace(openalex_prefix, ''): {ref.replace(openalex_prefix, '') for ref in r['referenced_works']}
    for r in raw_records
}

# set().union(*sets) also works when there are no records at all, whereas
# set.union(*sets) raises TypeError if given zero arguments.
all_refs_of_focal_works = set().union(*focal_works.values())
print(f'{len(focal_works):,} works and {len(all_refs_of_focal_works):,} references')

# free up some memory!
del raw_records

In [119]:
# this can be cached, as the raw records won't be needed again
known_refs = {fw: list(refs) for fw, refs in focal_works.items()}
print(len(known_refs))

113937


In [40]:
# How many have at least 10 references?
# Number of focal works that have at least 10 references.
sum(len(refs) >= 10 for refs in focal_works.values())

88827

In [8]:
# Reference notes on the two OpenAlex citation filters, parked in a throwaway
# string; no code visible in this notebook reads `_`.
# In the example URLs, %3D and %3A are the percent-encoded forms of "=" and ":",
# i.e. they stand for `filter=cites:W...` and `filter=cited_by:W...`.
_ = """

cites
Value: the OpenAlex ID for a given work
Returns: works that cite the given work. You can think of this as incoming citations. 
Get works that cite https://openalex.org/W2741809807: https://api.openalex.org/works?filter%3Dcites%3AW2741809807

cited_by
Value: the OpenAlex ID for a given work
Returns: works found in the given work's referenced_work section. You can think of this as outgoing citations. 
Get works cited by https://openalex.org/W2766808518: https://api.openalex.org/works?filter%3Dcited_by%3AW2766808518

"""


In [9]:
DISR_DF_PATH

PosixPath('data/disruption_df.feather')

In [118]:
# # original creation -- rather important not to accidentally overwrite this!
# disr_df = pd.DataFrame(index=focal_works.keys(), columns='di_nok_1 focal_refs len_I len_J fetched'.split())
# disr_df.index.rename('id', inplace=True)
# disr_df['fetched'].fillna(value=False, inplace=True)
# # have to bounce the index to a column to feather out
# disr_df.reset_index().to_feather(DISR_DF_PATH)

In [None]:
# by chunking our job 50 ids at a time,
# we only need 2,279 (about 38 minutes at 1 second per) queries...
# it actually takes about 3x that because the average 50-id request returns 400-600 refs
# but still! 17x faster at 3x... call it around 2h raw API time
# rather than the full 113,937 (31h 39m) or even the reduced 88k (24h 27m)

In [117]:
# Reload the checkpoint written by earlier runs. Its `fetched` column records
# which works are already done, so re-running this cell resumes the job
# instead of starting over.
disr_df = pd.read_feather(DISR_DF_PATH).set_index('id')

# Sorting keeps the batch boundaries identical from one run to the next.
ids = sorted(disr_df.index)
# print(ids[:10])

# Several ids can be joined with "|" in a single `cites:` filter, so one query
# covers a whole batch, e.g.
# https://api.openalex.org/works?mailto=matvan@umich.edu&filter=cites:W2737572559|W2737572559|W3041843825&per_page=200&page=1
batch_size = 50
delay = 1

for i in range(0, len(ids), batch_size):
    clear_output(wait=True)
    # Element-wise comparison on purpose: `fetched` is a DataFrame column.
    n_fetched = int((disr_df.fetched == True).sum())  # noqa: E712
    print(n_fetched, 'items of', len(disr_df), 'handled')
    if n_fetched == len(disr_df):
        break
    print('i =', i)

    # Skip ids that a previous run already handled.
    raw_id_batch = ids[i:i + batch_size]
    id_batch = [x for x in raw_id_batch if not disr_df.at[x, 'fetched']]
    if not id_batch:
        continue
    print('batch =', id_batch)

    # paper id -> list of the ids it references, for the focal works of this
    # batch plus every work returned as citing at least one of them.
    mini_refs_network = {x: known_refs[x] for x in id_batch}
    # TODO next time -- type=journal-article
    cited_url = "https://api.openalex.org/works?mailto=matvan@umich.edu&filter=cites:" + '|'.join(id_batch)
    # max_pages=50 at 200 records per page caps a batch at 10,000 citing
    # records; anything beyond that is not retrieved and so not counted below.
    records_batch = retrieve_all_openalex_records(cited_url, delay=delay, max_pages=50)
    print('Adding', len(records_batch))

    for rec in records_batch:
        r_id = rec['id'].replace('https://openalex.org/', '')
        mini_refs_network[r_id] = [x.replace('https://openalex.org/', '') for x in rec['referenced_works']]

    # Invert the mapping: from paper -> its references
    # to referenced work -> the papers that cite it.
    citee_to_citer = defaultdict(list)
    for paper, refs in mini_refs_network.items():
        for ref in refs:
            citee_to_citer[ref].append(paper)

    # Report sizes only after citee_to_citer exists for this batch.
    print('mini_refs_network', len(mini_refs_network))
    print('citee_to_citer', len(citee_to_citer))

    for focal_id in id_batch:

        print(focal_id, end='... ')

        focal_refs = set(mini_refs_network[focal_id])
        # A set, so a citing work is counted once even if it appears more
        # than once in the inverted list.
        citers = set(citee_to_citer.get(focal_id, ()))

        # J: papers that cite the focal paper and also cite at least one of its references
        # novelpy J = set(citing_focal_paper.keys()).intersection(citing_ref_from_focal_paper.keys())
        J = {cw for cw in citers if focal_refs.intersection(mini_refs_network[cw])}

        # I: papers that cite the focal paper but none of its references
        # novelpy I = set(citing_focal_paper.keys()) - J
        I = citers - J
        len_I = len(I)
        len_J = len(J)

        # A work with no citations at all has no defined value: rather than
        # report 0, leave the index empty so such works can be told apart.
        n_citers = len_I + len_J
        di_nok_1 = (len_I - len_J) / n_citers if n_citers else None

        row = [di_nok_1, len(focal_refs), len_I, len_J, True]
        print(row)
        disr_df.loc[focal_id] = row

    # Checkpoint after every batch so an interrupted run loses at most one batch.
    print('Writing df to', DISR_DF_PATH)
    disr_df.reset_index().to_feather(DISR_DF_PATH)

113937 items of 113937 handled


In [115]:
disr_df.describe()

Unnamed: 0,di_nok_1,focal_refs,len_I,len_J
count,97338.0,113937.0,113782.0,113782.0
mean,-0.209524,31.629585,7.031692,14.223436
std,0.621898,31.869399,39.775831,38.743563
min,-1.0,0.0,0.0,0.0
25%,-0.692308,12.0,1.0,1.0
50%,-0.333333,27.0,3.0,5.0
75%,0.142857,43.0,7.0,15.0
max,1.0,1355.0,8002.0,3834.0
