In [25]:
import networkx as nx
import pandas as pd
import os
import numpy as np
from itertools import combinations
from scipy.stats import rankdata
import matplotlib.pyplot as plt

In [26]:
# Absolute, machine-specific Windows locations: the validated leadership CSV
# and the root data folder (data_path is joined with an output sub-folder in
# the export cell at the end of the notebook).
leadership_path = r'C:\Projects\connecteddatahub\data\cleaned_dataframes\master_leadershipdata_validated.csv'
data_path = r'C:\Projects\connecteddatahub\data'
# Load the full leadership table into memory.
leadership_df = pd.read_csv(leadership_path)
# Print the column index so the available fields are visible before filtering.
print(leadership_df.columns)

Index(['Year', 'Name', 'Position', 'Institution', 'SubInstitution',
       'Education', 'FixedPosition', 'Seniority', 'Designation',
       'AffiliationId', 'carnegie_id', 'SystemId', 'RawName', 'RawInstitution',
       'RawPosition', 'FirstName', 'MiddleName', 'LastName', 'NickName',
       'SuffixName', 'PrefixName', 'MiddleInitials', 'RawName4Match',
       'LeaderId'],
      dtype='object')


In [27]:
# Keep only the rows whose FixedPosition is exactly 'Board Member'.
is_board_member = leadership_df['FixedPosition'].eq('Board Member')
board_df = leadership_df.loc[is_board_member]
print(board_df.shape)

(54697, 24)


In [28]:
from itertools import combinations
import pandas as pd
import numpy as np

def build_interlock_graphs(board_df):
    """
    Build a PERSON-PERSON interlock network from board membership data.

    Nodes = people (FirstName + LastName).
    Edges = undirected ties between people who serve on the same board
            (same Year & AffiliationId/Institution).

    Each edge keeps the institutions/affiliation IDs where that pair co-served,
    so you can connect back to schools and study whether highly connected
    individuals matter for outcomes.

    Missing-value handling
    ----------------------
    - Rows with a blank name, or with a missing Year, AffiliationId or
      Institution, are ignored: they cannot be placed on a specific board.
    - A missing SystemId does NOT exclude a board. Such boards still produce
      edges; the missing value is simply left out of ``SystemIds``.

    Parameters
    ----------
    board_df : pd.DataFrame
        Expected columns (min):
        ['Year', 'FirstName', 'LastName', 'AffiliationId', 'Institution', 'SystemId']
        The input frame is not modified.

    Returns
    -------
    edges_df : pd.DataFrame
        One row per (Year, Person_1, Person_2), with Person_1 < Person_2.
        Columns:
          - Year
          - Person_1, Person_2        (canonicalized person IDs)
          - Institutions              (list of institutions where they co-served that year)
          - AffiliationIds            (list of affiliation IDs for those boards)
          - SystemIds                 (list of non-missing system IDs; may be empty)
          - CoServiceCount            (# distinct boards / affiliations they share that year)

    nodes_df : pd.DataFrame
        Columns:
          - Person                    (unique person identifier: "First Last")
        Only people who appear in at least one edge are listed.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``board_df``.
    """

    df = board_df.copy()

    # --- Clean and define person identifier ---
    # NOTE(review): the identifier is the display name, so two different people
    # with the same first/last name collapse into one node - confirm this is
    # acceptable for the analysis.
    df["FirstName"] = df["FirstName"].fillna("").astype(str).str.strip()
    df["LastName"] = df["LastName"].fillna("").astype(str).str.strip()
    df["Person"] = (df["FirstName"] + " " + df["LastName"]).str.strip()

    # Drop rows without a usable person name
    df = df[df["Person"] != ""]

    # A board is identified by Year + AffiliationId + Institution. Rows missing
    # any of them are dropped explicitly, so that the dropna=False grouping
    # below cannot lump unplaced people into one artificial "board".
    df = df.dropna(subset=["Year", "AffiliationId", "Institution"])

    interlocks = []

    # For each board: people on same (Year, AffiliationId, Institution) co-serve.
    # dropna=False keeps boards whose SystemId is missing; with the pandas
    # default (dropna=True) those boards would be discarded entirely.
    group_cols = ["Year", "AffiliationId", "Institution", "SystemId"]
    for (year, affil, inst, sysid), g in df.groupby(group_cols, dropna=False):
        # Unique, sorted names -> every pair comes out as (smaller, larger),
        # which gives each undirected tie one canonical orientation.
        people = sorted(g["Person"].unique())
        if len(people) < 2:
            continue

        # All unordered pairs of people on this board
        interlocks.extend(
            {
                "Year": year,
                "Person_1": p1,
                "Person_2": p2,
                "Institution": inst,
                "AffiliationId": affil,
                "SystemId": sysid,
            }
            for p1, p2 in combinations(people, 2)
        )

    if not interlocks:
        # No edges: return empty, but structurally consistent
        edges_df = pd.DataFrame(
            columns=[
                "Year", "Person_1", "Person_2",
                "Institutions", "AffiliationIds", "SystemIds",
                "CoServiceCount"
            ]
        )
        nodes_df = pd.DataFrame(columns=["Person"])
        return edges_df, nodes_df

    edges_raw = pd.DataFrame(interlocks)

    # --- Aggregate to one edge per (Year, Person_1, Person_2) ---
    edges_df = (
        edges_raw
        .groupby(["Year", "Person_1", "Person_2"], as_index=False)
        .agg({
            # all institutions where this pair co-served in that year
            "Institution": lambda x: sorted(set(x)),
            "AffiliationId": lambda x: sorted(set(x)),
            # missing system IDs are skipped, so this list may be empty
            "SystemId": lambda x: sorted(set(v for v in x if pd.notna(v)))
        })
        .rename(columns={
            "Institution": "Institutions",
            "AffiliationId": "AffiliationIds",
            "SystemId": "SystemIds"
        })
    )

    # Number of distinct boards/affiliations they share that year
    # (can interpret as tie strength)
    edges_df["CoServiceCount"] = edges_df["AffiliationIds"].apply(len)

    # --- Nodes: unique people ---
    nodes_df = (
        pd.DataFrame({"Person": pd.unique(
            pd.concat([edges_df["Person_1"], edges_df["Person_2"]], ignore_index=True)
        )})
        .sort_values("Person")
        .reset_index(drop=True)
    )

    return edges_df, nodes_df


In [29]:
# Build the person-person edge list and node list from the board rows.
edges_df, nodes_df = build_interlock_graphs(board_df)

# Duplicate the endpoint columns under Source/Target names
# (presumably for a graph tool's edge-list import - confirm).
edges_df = edges_df.assign(
    Source=lambda frame: frame["Person_1"],
    Target=lambda frame: frame["Person_2"],
)


In [30]:
# Write the edge and node tables under <data_path>/interlocks.
# to_csv does not create missing folders, so make sure the target exists first;
# exist_ok=True makes this a no-op when the folder is already there.
interlocks_dir = os.path.join(data_path, 'interlocks')
os.makedirs(interlocks_dir, exist_ok=True)

edges_df.to_csv(os.path.join(interlocks_dir, 'person_interlock_edges.csv'), index=False)
nodes_df.to_csv(os.path.join(interlocks_dir, 'person_interlock_nodes.csv'), index=False)