In [1]:
"""
This file extracts metadata on author counts per category and on authors switching between
categories from the arXiv dataset on Kaggle.

NOTE: you have to download the arXiv data yourself. I got it from
https://www.kaggle.com/datasets/Cornell-University/arxiv, and I placed the unzipped file
"arxiv-metadata-oai-snapshot.json" (around 4 GB) inside the folder "arxiv_data".
"""

import numpy as np
import pandas as pd
import math
import itertools

import gc
import os
import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import plotly.graph_objs as go
import kaleido

from pathlib import Path
# Paths are relative, so they resolve against the directory the notebook is run from.
root = Path(".")
data_dir = root / "arxiv_data"  # you will have to create this

import plotly.express as px
import re
# Regex capturing a four-digit number whose first digit is 1 or 2 (i.e. 1000-2999).
# No cell visible in this notebook uses it; presumably meant for pulling years out of text.
year_pattern = r'([1-2][0-9]{3})'

In [2]:
# Some basic functions
from basic_utils import *

def get_metadata():
    """
    Lazily yield the raw text lines of the arXiv metadata snapshot.

    The snapshot is read from ``data_dir / 'arxiv-metadata-oai-snapshot.json'``.
    Lines are yielded one at a time (the file is never loaded whole), and the
    caller is expected to decode each line itself, e.g. with ``json.loads``.

    Yields:
        str: one line of the snapshot file, including its trailing newline.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can fail or mis-decode non-ASCII text.
    with open(data_dir / 'arxiv-metadata-oai-snapshot.json', 'r', encoding='utf-8') as f:
        yield from f

### Extract data by publication

In [3]:
# Create a pandas dataframe with each publication information

# One dict per kept paper; the DataFrame is built once at the end
# (much cheaper than growing a DataFrame row by row).
rows = []

length = 0  # number of papers kept so far
metadata = get_metadata()
for paper in metadata:
    # Stop as soon as MAX_EXTRACTION_LENGTH papers have been kept. The earlier
    # `>` comparison only stopped after one paper more than the limit.
    if length >= MAX_EXTRACTION_LENGTH:
        break
    paper_data = json.loads(paper)

    # Map each raw tag through check_in_major_categories exactly once (it used
    # to be called twice per tag), drop tags it rejects (None), and de-duplicate.
    mapped_tags = (check_in_major_categories(cat) for cat in paper_data['categories'].split())
    categories = list(set([cat for cat in mapped_tags if cat is not None]))
    if len(categories) == 0:
        continue  # skip this one, it's a weird paper with no normal tags
    paper_id = paper_data['id']
    pub_date = extract_decimal_year_of_pub(paper_data['versions'])
    authors_parsed = paper_data['authors_parsed']

    # Store the extracted fields for this paper
    rows.append({'id': paper_id, 'categories': categories, 'pub_date': pub_date, 'authors_parsed': authors_parsed})

    # Progress report, counted in kept papers (skipped papers are not counted)
    if length % 100_000 == 0:
        print(length, 'done')
    length += 1

# Closing the generator also closes the snapshot file when the loop ended early
# via `break`; it is a no-op if the file was already read to the end.
metadata.close()

# Create a DataFrame using the accumulated rows
columns = ['id', 'categories', 'pub_date', 'authors_parsed']
df = pd.DataFrame(rows, columns=columns)

print(len(df))

0 done
100000 done
200000 done
300000 done
400000 done
500000 done
600000 done
700000 done
800000 done
900000 done
1000000 done
1100000 done
1200000 done
1300000 done
1400000 done
1500000 done
1600000 done
1700000 done
1800000 done
1900000 done
2000000 done
2100000 done
2200000 done
2208915


### Get unique categories and mapping

In [4]:

# Flatten the per-paper category lists, de-duplicate, and sort alphabetically
unique_categories = sorted(set(itertools.chain.from_iterable(df['categories'])))

def index_to_category(ind):
    """Return the category name stored at position ``ind`` of ``unique_categories``."""
    category_name = unique_categories[ind]
    return category_name

# Category name -> position in unique_categories, built once so that each
# lookup is a constant-time dict access instead of a linear `list.index` scan
# (the lookup runs for every category of every paper of every author below).
# NOTE: rebuild this mapping if unique_categories is ever replaced by a list
# with different contents or ordering.
_CATEGORY_TO_INDEX = {cat: ind for ind, cat in enumerate(unique_categories)}

def category_to_index(cat):
    """
    Return the position of category name ``cat`` in ``unique_categories``.

    Raises:
        ValueError: if ``cat`` is not a known category (same exception type
            that ``list.index`` raised before).
    """
    try:
        return _CATEGORY_TO_INDEX[cat]
    except KeyError:
        raise ValueError(f"{cat!r} is not in list") from None

print(unique_categories)

['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft', 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con', 'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY', 'econ.EM', 'eess.AS', 'eess.IV', 'eess.SP', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MG', 'math.MP', 'math.NA', 'math.NT',

### Extract data by author

In [5]:
# Create a dictionary with the contributions of each unique author name.

# Maps author key -> list of [paper_id, pub_date, categories], one entry per paper
author_papers = {}

# Walk the four needed columns in lockstep. This yields the same values as
# df.iterrows() but avoids building a pandas Series for every one of the rows,
# which dominates the run time on a frame of this size.
paper_columns = zip(df['id'], df['pub_date'], df['categories'], df['authors_parsed'])
for paper_id, pub_date, categories, authors_parsed in paper_columns:

    # Iterate through the authors of each paper
    for author in authors_parsed:
        # The key is the parsed name parts concatenated without a separator.
        # NOTE(review): different people with identical name parts share a key
        # (a later cell filters on publication count to limit this).
        author_key = "".join(author)

        # setdefault creates the list on an author's first paper, then appends
        author_papers.setdefault(author_key, []).append([paper_id, pub_date, categories])

print(len(author_papers))


1762689


### Extract author status data by date 

In [6]:
# Initialize the author_status_metadata
# Maps each time bin -> list of [current_categories, previous_categories], one
# entry per author active in that bin. Both are lists of category indices:
# the categories the author published in during this bin, and during the
# author's previous active bin (for the author's first bin, the two are equal).
#
# An author is only considered active in bins where they publish something.

author_status_metadata = defaultdict(list)
bin_size = YEARS_BINSIZE

for author_key, papers in author_papers.items():
    if not (MIN_PUBLICATIONS <= len(papers) <= MAX_PUBLICATIONS):
        continue  # only use authors with a few publications, or not too many (otherwise likely a non-unique name)

    # Bin the papers by date: binned date -> set of category indices published in that bin
    binned_papers = defaultdict(set)
    for _, date, categories_as_words in papers:
        binned_date = bin_date(date, bin_size)
        binned_papers[binned_date].update(category_to_index(item) for item in categories_as_words)

    # Visit the author's active bins in chronological order. Iterating the
    # stored keys directly (instead of stepping a float date from the first to
    # the last bin and probing the dict) means that
    #   * no bin can be missed because date arithmetic did not land exactly on a key,
    #   * no empty bins get inserted into binned_papers as a side effect of probing,
    #   * "current" is always this author's own bin -- the old loop read
    #     current_year_categories outside the `if` that assigned it, so it could
    #     be unbound or left over from the previous author.
    prev_categories = None
    for binned_date in sorted(binned_papers):
        current_categories = binned_papers[binned_date]
        if prev_categories is None:
            prev_categories = current_categories  # first active bin: no switch recorded
        author_status_metadata[binned_date].append([list(current_categories), list(prev_categories)])
        prev_categories = current_categories

sorted_dates = list(sorted(author_status_metadata.keys()))
print(len(sorted_dates))

37


In [7]:
# Now for each bin, we can take the list of papers, and measure things. 

def count_authors_by_category(author_status_data, unique_categories):
    """
    Count authors per category for one time bin, using fractional weights.

    Each entry of ``author_status_data`` is indexed so that element 0 is the
    list of category indices the author is currently publishing in. An author
    contributes a total weight of 1, split evenly over those categories.

    Returns a 1-D numpy array with one slot per entry of ``unique_categories``.
    """
    n_categories = len(unique_categories)
    totals = np.zeros(n_categories)

    for status in author_status_data:
        active = status[0]
        share = 1 / len(active)  # equal split across the author's active categories
        for cat_index in active:
            totals[cat_index] += share

    return totals

def count_transitions(author_status_data, unique_categories):
    """
    Gets the transition matrix. First index is source, second index is target.
    The entry is the (fractional) number of transitions from one field to another.

    Each entry of ``author_status_data`` is indexed so that element 0 is the
    list of current category indices and element 1 the list of previous ones.
    For every author, a share of 1 is spread evenly over the current categories
    and a share of 1 is removed evenly from the previous ones. Categories whose
    net share is negative are sources, those with a positive net share are
    targets. Each source sends its loss to the targets in proportion to the
    targets' gains.

    Returns a 2-D numpy array of shape
    ``(len(unique_categories), len(unique_categories))``.
    """
    n_categories = len(unique_categories)
    transition_count = np.zeros((n_categories, n_categories))

    for author in author_status_data:
        current_categories = author[0]
        previous_categories = author[1]

        # Net change of the author's share in every category
        net_change = np.zeros(n_categories)
        for cat in current_categories:
            net_change[cat] += 1 / len(current_categories)
        for cat in previous_categories:
            net_change[cat] -= 1 / len(previous_categories)

        # Total share leaving all source categories. It is the same for every
        # source of this author, so compute it once here rather than
        # re-scanning the whole vector inside the source loop.
        total_outflow = np.sum(-net_change[net_change < 0])
        if not total_outflow > 0:
            continue  # nothing left any category, so there is no transition to record

        # Targets: current categories that gained share
        gaining = [cat for cat in current_categories if net_change[cat] > 0]

        for source_index in previous_categories:
            lost = -net_change[source_index]
            if not lost > 0:
                continue  # this previous category did not lose share

            # Fraction of the author's total outflow that comes from this source
            ratio = lost / total_outflow
            for target_index in gaining:
                transition_count[source_index, target_index] += ratio * net_change[target_index]

    return transition_count

# Per-bin summaries keyed by bin date: fractional author counts per category ...
author_counts = dict(
    (date, count_authors_by_category(author_status_metadata[date], unique_categories))
    for date in sorted_dates
)
# ... and the category-to-category transition matrix
author_transitions = dict(
    (date, count_transitions(author_status_metadata[date], unique_categories))
    for date in sorted_dates
)
print('done')

done


## Save the data to json for reloading

In [8]:
# To save to JSON we need to convert numpy arrays to lists and back and forth

# Convert BEFORE opening each file: open(..., 'w') truncates the file at once,
# so converting inside the `with` block would leave an empty file behind if the
# conversion raised.
# NOTE(review): the helper's return value is never used, so it presumably
# converts the dictionary in place -- confirm in basic_utils.
# NOTE: JSON object keys are always strings, so the bin-date keys of these two
# dictionaries are written as strings.
convert_dict_of_arrays_to_dict_of_lists(author_counts)
with open('author_counts.json', 'w') as f:
    json.dump(author_counts, f)

convert_dict_of_arrays_to_dict_of_lists(author_transitions)
with open('author_transitions.json', 'w') as f:
    json.dump(author_transitions, f)

# The labels are plain lists and need no conversion
with open('unique_categories.json', 'w') as f:
    json.dump(unique_categories, f)
with open('sorted_dates.json', 'w') as f:
    json.dump(sorted_dates, f)

## Way to reload the data in another file

In [9]:
# Load the dictionary from the JSON file, as a dictionary of numpy arrays
# NOTE(review): the helper's return value is never used, so it presumably
# converts the dictionary in place -- confirm in basic_utils.
with open('author_counts.json', 'r') as f:
    author_counts = json.load(f)
    convert_dict_of_lists_to_dict_of_arrays(author_counts)
with open('author_transitions.json', 'r') as f:
    author_transitions = json.load(f)
    convert_dict_of_lists_to_dict_of_arrays(author_transitions)

# JSON object keys are always strings, so the bin dates come back as e.g.
# '1995.0' while the entries of sorted_dates come back as numbers. Restore
# numeric keys so that `author_counts[date] for date in sorted_dates` works
# exactly as it did before saving.
author_counts = {float(date): values for date, values in author_counts.items()}
author_transitions = {float(date): values for date, values in author_transitions.items()}

# Load the labels
with open('unique_categories.json', 'r') as f:
    unique_categories = json.load(f)
with open('sorted_dates.json', 'r') as f:
    sorted_dates = json.load(f)