In [1]:
import json
import re
from collections import Counter
from itertools import combinations

import networkx as nx
import pandas as pd

In [2]:
# ---------- 1. Load data ----------
# Read the CSV export into a DataFrame; later cells use its 'Authors',
# 'Title' and 'Authors with affiliations' columns.
df = pd.read_csv(filepath_or_buffer='data_scopus.csv')

In [3]:
# ---------- 2. Build affiliation mapping ----------
# Each cell of 'Authors with affiliations' is a ';'-separated list of entries.
# Within an entry, the text before the first comma is used as the author name
# and the text after it as that author's affiliation. When a name occurs more
# than once, the entry seen last wins.
affiliation_map = {}
for aff_text in df['Authors with affiliations'].dropna():
    for entry in (piece.strip() for piece in aff_text.split(';')):
        if not entry:
            continue
        parsed = re.match(r"([^,]+),\s*(.*)", entry)
        if parsed is None:
            # No comma in the entry: there is no affiliation part to record.
            continue
        # Group 1 cannot contain a comma, so trimming whitespace is the only
        # normalisation the key needs.
        author_name = parsed.group(1).strip()
        affiliation_map[author_name] = parsed.group(2)

# Helper: extract country from affiliation
def extract_country(aff):
    """Return the country named at the end of an affiliation string.

    The country is taken to be the text after the last comma of ``aff``.
    This is a heuristic: the segment is accepted only if it looks like a
    place name, i.e. it

    * has at least two characters and starts with an uppercase letter, and
    * consists solely of letters (any script), whitespace, periods,
      apostrophes and hyphens.

    Compared with a plain "Capitalised ASCII words" pattern this also accepts
    abbreviations ("USA", "UK"), names with lowercase connectors
    ("Bosnia and Herzegovina"), hyphens or apostrophes ("Guinea-Bissau",
    "Côte d'Ivoire") and accented letters, which would otherwise be lost.

    Parameters
    ----------
    aff : object
        Affiliation text. Anything that is not a ``str`` (``None``, NaN, ...)
        yields ``None``.

    Returns
    -------
    str or None
        The trailing segment with surrounding whitespace removed, or ``None``
        when there is no comma or the segment does not look like a place name
        (for example because it contains digits).
    """
    if not isinstance(aff, str):
        return None

    _head, comma, tail = aff.strip().rpartition(',')
    if not comma:
        # A comma is required so a bare one-segment affiliation is not
        # mistaken for a country.
        return None

    candidate = tail.strip()
    if len(candidate) < 2 or not candidate[0].isupper():
        return None

    # Digits and other symbols indicate a postcode, address or similar.
    if all(ch.isalpha() or ch.isspace() or ch in ".'’-" for ch in candidate):
        return candidate
    return None

In [4]:
# ---------- 3. Build graph ----------
# Co-authorship graph: one node per author (with affiliation, country and the
# set of paper titles) and one edge per pair of co-authors, whose weight
# counts the rows (papers) on which the two names appear together.
G = nx.Graph()

for _, row in df.iterrows():
    raw_authors = row['Authors']
    # A missing 'Authors' cell is NaN; str(NaN) would otherwise create a
    # bogus author node called "nan".
    if pd.isna(raw_authors):
        continue

    # De-duplicate while keeping order so a name repeated within one row
    # cannot produce a self-loop.
    authors = list(dict.fromkeys(
        a.strip() for a in str(raw_authors).split(',') if a.strip()
    ))
    title = row['Title']

    for author in authors:
        if not G.has_node(author):
            # Resolve the affiliation once, when the node is created.
            # Prefer the exact author name. Only if that fails, fall back to a
            # best-effort match on the first name token, compared against
            # whole tokens of each key so that e.g. "Li" no longer matches
            # "Liu X." or "Oliveira P.".
            # NOTE(review): the fallback can still pick a different author who
            # shares that token; confirm whether it is wanted at all.
            aff_text = affiliation_map.get(author)
            if aff_text is None:
                surname = author.split()[0]
                aff_text = next(
                    (aff for key, aff in affiliation_map.items()
                     if surname in key.split()),
                    None,
                )
            country = extract_country(aff_text) or "Unknown"
            G.add_node(author, affiliation=aff_text, country=country, papers=set())

        # Skip missing titles: a NaN in the set would later be serialised as
        # the non-standard JSON token NaN.
        if pd.notna(title):
            G.nodes[author]['papers'].add(title)

    # add co-author edges
    for first, second in combinations(authors, 2):
        if G.has_edge(first, second):
            G[first][second]['weight'] += 1
        else:
            G.add_edge(first, second, weight=1)

In [5]:
# ---------- 4. Top-10 countries ----------
# Count one occurrence per author node, ignoring nodes whose country could not
# be determined, and keep the ten most frequent country names.
countries = [
    attrs['country']
    for _node, attrs in G.nodes(data=True)
    if attrs['country'] != "Unknown"
]
country_counts = Counter(countries)
top10 = {country_name for country_name, _count in country_counts.most_common(10)}

In [6]:
# ---------- 5. Build JSON structure ----------
# Node-link layout: {"nodes": [...], "links": [...]}.
nodes = [
    {
        "id": n,
        "country": data.get("country", "Unknown"),
        # The attribute is always present but may be None; emit "" in that
        # case, which is what the previous .get(..., "") default intended.
        "affiliation": data.get("affiliation") or "",
        # 'papers' is a set, so its iteration order differs between kernel
        # sessions; sort it to make the output file reproducible. Missing
        # titles (NaN) are dropped because they would be written as the
        # non-standard JSON token NaN.
        "papers": sorted(
            (t for t in data.get("papers", ()) if not pd.isna(t)),
            key=str,
        ),
        "degree": int(G.degree(n)),
        "isTopCountry": data.get("country") in top10,
    }
    for n, data in G.nodes(data=True)
]

links = [
    {
        "source": u,
        "target": v,
        "weight": d.get("weight", 1),
    }
    for u, v, d in G.edges(data=True)
]

graph_json = {"nodes": nodes, "links": links}

# ---------- 6. Write to JSON file ----------
# ensure_ascii=False keeps non-ASCII characters readable in the file, which
# is why the file is opened explicitly as UTF-8.
with open("data.json", mode="w", encoding="utf-8") as out_file:
    json.dump(
        graph_json,
        out_file,
        indent=2,
        ensure_ascii=False,
    )