In [None]:
import pandas as pd

# Load the node sheet and the edge sheet from their Excel workbooks.
nodes_table = pd.read_excel("nodes_undirected.xlsx")
edges_table = pd.read_excel("edges_undirected.xlsx")


def _split_affiliations(cell):
    """Split a semicolon-separated string into trimmed, non-empty entries."""
    trimmed = (piece.strip() for piece in cell.split(";"))
    return [name for name in trimmed if name]


# Turn the "Affiliations" column into one list of institution names per row.
# Missing cells are replaced by "" first, so those rows get an empty list.
nodes_table["affiliation_list"] = (
    nodes_table["Affiliations"].fillna("").apply(_split_affiliations)
)

# Lookup table: node "Id" -> list of affiliated institutions.
# If an Id occurs more than once, the last row wins.
author_affiliations = {
    node_id: institutions
    for node_id, institutions in zip(
        nodes_table["Id"], nodes_table["affiliation_list"]
    )
}

# Define a function to generate all unique institution-to-institution pairs for a given co-authorship
# Self-loops (same institution on both ends) are excluded
def get_institution_pairs(row, affiliations=None):
    """Return the distinct cross-institution pairs implied by one edge.

    Every affiliation of the edge's ``Source`` is combined with every
    affiliation of its ``Target``. Each pair is stored as an alphabetically
    ordered tuple, so (A, B) and (B, A) collapse into a single entry, and
    pairs whose two sides are the same institution are skipped.

    Args:
        row: Any object supporting ``row["Source"]`` and ``row["Target"]``
            (for example a DataFrame row or a plain dict).
        affiliations: Optional mapping from node id to a list of institution
            names. When omitted, the module-level ``author_affiliations``
            mapping is used.

    Returns:
        A sorted list of ``(institution_a, institution_b)`` tuples. The list
        is empty when either endpoint is missing from the mapping, has no
        affiliations, or when the only combinations are same-institution.
    """
    if affiliations is None:
        affiliations = author_affiliations
    source_institutions = affiliations.get(row["Source"], [])
    target_institutions = affiliations.get(row["Target"], [])
    pairs = set()
    for src in source_institutions:
        for tgt in target_institutions:
            if src != tgt:
                pairs.add(tuple(sorted((src, tgt))))
    # Set iteration order for strings varies between interpreter runs;
    # sorting makes the returned list reproducible.
    return sorted(pairs)

# Compute the institution pairs for every edge in the network.
# Each co-authorship may produce multiple institution-level collaboration links.
# The column is built from "Source"/"Target" directly instead of through
# DataFrame.apply(axis=1), so an edge table with zero rows still yields an
# (empty) list column and ids are looked up with their original column values.
edges_table["institution_pairs"] = pd.Series(
    [
        get_institution_pairs({"Source": source, "Target": target})
        for source, target in zip(edges_table["Source"], edges_table["Target"])
    ],
    index=edges_table.index,
    dtype=object,
)

# Explode the list of institution pairs into separate rows (one row per
# institution-institution pair). Edges with an empty list become NaN after
# explode and are dropped.
edges_expanded = (
    edges_table.explode("institution_pairs")
    .dropna(subset=["institution_pairs"])
    .copy()
)

# Split each institution pair tuple into two separate columns.
# The column names are passed to the helper frame explicitly so it always has
# exactly two columns; without them a frame built from an empty list has zero
# columns and the two-column assignment raises
# "Columns must be same length as key".
pair_columns = ["Institution_1", "Institution_2"]
edges_expanded[pair_columns] = pd.DataFrame(
    edges_expanded["institution_pairs"].tolist(),
    columns=pair_columns,
    index=edges_expanded.index,
)

# Group by institution pair and sum the edge weights of all contributing rows.
institution_collab_edges = (
    edges_expanded.groupby(pair_columns)["Weight"].sum().reset_index()
)

# Save the final institution-to-institution collaboration network to CSV
institution_collab_edges.to_csv("institution_collaborations.csv", index=False)