In [1]:
from tqdm import tqdm
import json
import networkx as nx
from datetime import datetime

# Undirected co-authorship graph: one node per author id, one edge per pair of
# authors that appear together on at least one work.
G = nx.Graph()
# Highest accumulated per-author citation total seen so far
# (stays at -1 until the first author has been processed).
max_citations = -1

# This json file has all the filtered data provided by OpenAlex API
with open("../data/openalex_cs_papers.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collecting author and co-authorship features
for work in tqdm(data["results"]):
    # `or 0` also covers an explicit null value, which `.get(key, 0)` would
    # hand back as None and then break the `+=` on the node below.
    cited_by = work.get("cited_by_count") or 0
    publication_date_str = work.get("publication_date")
    assert publication_date_str is not None, (
        f"work {work.get('id')!r} has no publication_date"
    )

    publication_date = datetime.strptime(publication_date_str, "%Y-%m-%d")
    title = work["title"]

    # Distinct author ids of this work, in first-seen order. An id repeated
    # within one work's authorships must be counted once only: otherwise the
    # work's citations are added twice to that author and the pairing loop
    # below creates a self-loop plus duplicated title/date entries.
    author_ids = []
    seen_ids = set()

    # Adding/updating attributes for each author
    for author_data in work["authorships"]:
        author_id = author_data["author"]["id"]
        institutions = author_data.get("institutions")
        # Only the first listed institution is kept as the affiliation.
        affiliation = institutions[0]["display_name"] if institutions else "Unknown"

        # Custom attributes for author nodes
        if author_id not in G:
            G.add_node(
                author_id,
                affiliated_institution=affiliation,
                citation_count=0,
            )
        elif G.nodes[author_id]["affiliated_institution"] == "Unknown":
            # Backfill an affiliation that was missing when the author was
            # first seen (a no-op when this entry is "Unknown" as well).
            G.nodes[author_id]["affiliated_institution"] = affiliation

        if author_id in seen_ids:
            continue
        seen_ids.add(author_id)
        author_ids.append(author_id)

        G.nodes[author_id]["citation_count"] += cited_by
        max_citations = max(max_citations, G.nodes[author_id]["citation_count"])

    # Adding co-authorship edges: every unordered pair of distinct authors of
    # this work gets the work's title and publication date recorded on its edge.
    for i, id_1 in enumerate(author_ids):
        for id_2 in author_ids[i + 1:]:
            if G.has_edge(id_1, id_2):
                edge_attrs = G[id_1][id_2]
                edge_attrs["title"].append(title)
                edge_attrs["publication_dates"].append(publication_date)
            else:
                G.add_edge(
                    id_1,
                    id_2,
                    title=[title],
                    publication_dates=[publication_date],
                )


100%|██████████| 200/200 [00:00<00:00, 1212.41it/s]


In [2]:
# Show the top-level field names of the first work record, to see which
# attributes are available to pull features from.
data["results"][0].keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'institution_assertions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'fulltext_origin', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'datasets', 'versions', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'abstract_inverted_index_v3', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [4]:
# Display the topics of the first work. In the output below each entry has an
# id, display_name and score, plus nested subfield / field / domain dicts.
data["results"][0]['topics']

[{'id': 'https://openalex.org/T10206',
  'display_name': 'Meta-analysis and systematic reviews',
  'score': 0.9993,
  'subfield': {'id': 'https://openalex.org/subfields/1804',
   'display_name': 'Statistics, Probability and Uncertainty'},
  'field': {'id': 'https://openalex.org/fields/18',
   'display_name': 'Decision Sciences'},
  'domain': {'id': 'https://openalex.org/domains/2',
   'display_name': 'Social Sciences'}},
 {'id': 'https://openalex.org/T10416',
  'display_name': 'Traumatic Brain Injury Research',
  'score': 0.9832,
  'subfield': {'id': 'https://openalex.org/subfields/2713',
   'display_name': 'Epidemiology'},
  'field': {'id': 'https://openalex.org/fields/27',
   'display_name': 'Medicine'},
  'domain': {'id': 'https://openalex.org/domains/4',
   'display_name': 'Health Sciences'}},
 {'id': 'https://openalex.org/T12443',
  'display_name': 'Delphi Technique in Research',
  'score': 0.9455,
  'subfield': {'id': 'https://openalex.org/subfields/3312',
   'display_name': 'Soc

In [19]:
# Flatten the topic hierarchy of every work into one id list per level.
# Duplicates are kept, so each list holds one entry per (work, topic) pair.
all_topics = []
all_sub_fields = []
all_fields = []
all_domains = []
for work in data["results"]:
    for topic_dict in work["topics"]:
        all_topics.append(topic_dict["id"])
        all_sub_fields.append(topic_dict["subfield"]["id"])
        all_fields.append(topic_dict["field"]["id"])
        all_domains.append(topic_dict["domain"]["id"])

# For each level print its heading, the total number of collected ids and the
# number of distinct ids, in that order.
for heading, collected_ids in (
    ("All topics:", all_topics),
    ("All sub-fields:", all_sub_fields),
    ("All fields:", all_fields),
    ("All domains:", all_domains),
):
    print(heading)
    print(len(collected_ids))
    print(len(set(collected_ids)))
    if heading != "All domains:":
        # Blank line between groups, none after the last one.
        print()


All topics:
525
246
All sub-fields:
525
94
All fields:
525
23
All domains:
525
4


In [5]:
# Display the keywords of the first work. In the output below each entry has
# an id, display_name and score.
data["results"][0]['keywords']

[{'id': 'https://openalex.org/keywords/guideline',
  'display_name': 'Guideline',
  'score': 0.7005354},
 {'id': 'https://openalex.org/keywords/statement',
  'display_name': 'Statement (logic)',
  'score': 0.6089843},
 {'id': 'https://openalex.org/keywords/presentation',
  'display_name': 'Presentation (obstetrics)',
  'score': 0.5513345}]

In [11]:
# Keyword ids across all works, one entry per (work, keyword) pair, so
# duplicates are kept.
all_keywords = [
    keyword["id"]
    for work in data["results"]
    for keyword in work["keywords"]
]
# Total number of keyword assignments, then number of distinct keywords.
print(len(all_keywords))
print(len(set(all_keywords)))

363
225


In [6]:
# Display the concepts of the first work. In the output below each entry has
# an id, wikidata link, display_name, level and score.
data['results'][0]['concepts']

[{'id': 'https://openalex.org/C2779356329',
  'wikidata': 'https://www.wikidata.org/wiki/Q922625',
  'display_name': 'Checklist',
  'level': 2,
  'score': 0.8905365},
 {'id': 'https://openalex.org/C189708586',
  'wikidata': 'https://www.wikidata.org/wiki/Q1504425',
  'display_name': 'Systematic review',
  'level': 3,
  'score': 0.84047925},
 {'id': 'https://openalex.org/C2780182762',
  'wikidata': 'https://www.wikidata.org/wiki/Q1630279',
  'display_name': 'Guideline',
  'level': 2,
  'score': 0.7005354},
 {'id': 'https://openalex.org/C2777026412',
  'wikidata': 'https://www.wikidata.org/wiki/Q2684591',
  'display_name': 'Statement (logic)',
  'level': 2,
  'score': 0.6089843},
 {'id': 'https://openalex.org/C547195049',
  'wikidata': 'https://www.wikidata.org/wiki/Q1725664',
  'display_name': 'Terminology',
  'level': 2,
  'score': 0.5986307},
 {'id': 'https://openalex.org/C2777601897',
  'wikidata': 'https://www.wikidata.org/wiki/Q3409113',
  'display_name': 'Presentation (obstetrics)

In [12]:
# Concept ids across all works, one entry per (work, concept) pair, so
# duplicates are kept.
all_concepts = []
for work in data["results"]:
    all_concepts.extend(concept["id"] for concept in work["concepts"])

# Total number of concept assignments, then number of distinct concepts.
total_concepts = len(all_concepts)
distinct_concepts = len(set(all_concepts))
print(total_concepts)
print(distinct_concepts)

2825
769
