In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Root folder holding the project data. Defaults to the original local
# checkout; set CONNECTEDDATAHUB_DATA_DIR to run the notebook from another
# location without editing this cell.
data_path = os.environ.get('CONNECTEDDATAHUB_DATA_DIR', r'C:\Projects\connecteddatahub\data')
# Input files are derived from the root so the location is written only once.
grants_path = os.path.join(data_path, 'grants', 'university_grants.csv')
interlocks_path = os.path.join(data_path, 'interlocks', 'institution_interlocks.csv')


In [3]:
# Read the university grants CSV and show which columns it provides.
grants_df = pd.read_csv(filepath_or_buffer=grants_path)
print(grants_df.columns)

Index(['AffiliationId', 'carnegie_id', 'PrimarySample', 'FullName', 'SystemId',
       'year', 'fullname_norm', 'matched_name', 'num_grants', 'total_funding'],
      dtype='object')


In [4]:
# Load the institution-level interlock table and keep rows from MIN_YEAR on.
MIN_YEAR = 2010  # earliest year kept for the analysis
interlocks_df = pd.read_csv(interlocks_path)
# .copy() detaches the filtered rows from the full table, so later column
# assignments on interlocks_df (such as casting "Year" to str) operate on an
# independent frame and do not trigger SettingWithCopyWarning.
interlocks_df = interlocks_df[interlocks_df['Year'].astype(int) >= MIN_YEAR].copy()
print(interlocks_df.columns)


Index(['Year', 'AffiliationId', 'Institution', 'TotalInterlocks', 'BoardSize'], dtype='object')


In [5]:
# Cast both year columns to str so the join keys share one dtype.
interlocks_df["Year"] = interlocks_df["Year"].astype(str)
grants_df["year"] = grants_df["year"].astype(str)

# Left-join the yearly grant figures onto the interlock rows. Renaming the
# grant key to "Year" lets both frames join on identical column names, so no
# separate lowercase "year" column is left in the result.
# NOTE(review): assumes grants_df has at most one row per (AffiliationId, year);
# repeated keys would multiply the matching interlock rows — TODO confirm.
grant_columns = ["AffiliationId", "year", "num_grants", "total_funding"]
yearly_grants = grants_df[grant_columns].rename(columns={"year": "Year"})
merged_df = interlocks_df.merge(
    yearly_grants,
    how="left",
    on=["AffiliationId", "Year"],
)

# Rows without a matching grant record get zeros; grant counts become ints.
merged_df["total_funding"] = merged_df["total_funding"].fillna(0)
merged_df["num_grants"] = merged_df["num_grants"].fillna(0).astype(int)

print(merged_df.columns)


Index(['Year', 'AffiliationId', 'Institution', 'TotalInterlocks', 'BoardSize',
       'num_grants', 'total_funding'],
      dtype='object')


In [9]:
# Yearly interlock edge list. The path is built from data_path (defined in the
# path configuration cell) instead of repeating the absolute root a third time.
interlock_edges_path = os.path.join(data_path, 'interlocks', 'yearly_interlocks.csv')
yearly_edges_df = pd.read_csv(interlock_edges_path)
print(yearly_edges_df.columns)
print(yearly_edges_df.head(10))

Index(['Year', 'AffiliationId_1', 'Institution_1', 'AffiliationId_2',
       'Institution_2', 'InterlockCount', 'BoardSize_1', 'BoardSize_2',
       'WeightNorm'],
      dtype='object')
   Year  AffiliationId_1                           Institution_1  \
0  1999             1003              Tennessee Board Of Regents   
1  1999             1003              Tennessee Board Of Regents   
2  1999           392282  State University Of New York At Albany   
3  1999          1174212       University Of Southern California   
4  1999          1174212       University Of Southern California   
5  1999          1629065             Central Michigan University   
6  1999          5388228                 University Of Rochester   
7  1999          5388228                 University Of Rochester   
8  1999          5388228                 University Of Rochester   
9  1999          5388228                 University Of Rochester   

   AffiliationId_2                   Institution_2  InterlockCoun

In [7]:
# # NOTE(review): every line of this cell is commented out, so nothing here runs.
# # Re-enabling it needs networkx and statsmodels, which the notebook's active
# # import cell does not load.
# import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# import networkx as nx
# import statsmodels.api as sm
# import statsmodels.formula.api as smf

# # =======================================
# # 0. Compute yearly network centrality metrics
# # =======================================
# # One graph is built per year from yearly_edges_df; each node of each yearly
# # graph contributes one row to centrality_rows.
# centrality_rows = []

# for year, sub_edges in yearly_edges_df.groupby("Year"):
#     G = nx.Graph()
#     for _, row in sub_edges.iterrows():
#         G.add_edge(row["AffiliationId_1"], row["AffiliationId_2"], weight=row["WeightNorm"])

#     # Compute metrics
#     # NOTE(review): add_edge above stores the value under the edge attribute
#     # "weight", but the calls below look up an attribute named "WeightNorm".
#     # networkx presumably falls back to a weight of 1 for a missing attribute,
#     # which would make these metrics unweighted — confirm, and pass
#     # weight="weight" if weighting is intended.
#     # NOTE(review): betweenness_centrality reads weights as path lengths, so a
#     # larger WeightNorm would count as a longer path — verify this matches the
#     # intended meaning of WeightNorm before enabling the weighted version.
#     degree = nx.degree_centrality(G)
#     weighted_degree = dict(G.degree(weight="WeightNorm"))
#     betw = nx.betweenness_centrality(G, weight="WeightNorm", normalized=True)
#     clust = nx.clustering(G, weight="WeightNorm")

#     for node in G.nodes():
#         centrality_rows.append({
#             "Year": str(year),
#             "AffiliationId": node,
#             "degree_centrality": degree.get(node, 0),
#             "weighted_degree": weighted_degree.get(node, 0),
#             "betweenness_centrality": betw.get(node, 0),
#             "clustering_coeff": clust.get(node, 0),
#         })

# centrality_df = pd.DataFrame(centrality_rows)
# print(f"Centrality metrics computed for {len(centrality_df)} institution-year pairs")

# # Merge metrics into main merged_df
# # Left join: every merged_df row is kept; rows with no node in that year's
# # graph get NaN metrics, which section 1 below replaces with 0.
# df = merged_df.merge(centrality_df, on=["Year", "AffiliationId"], how="left")

# # =======================================
# # 1. Data preparation
# # =======================================
# # Values that cannot be parsed as numbers are coerced to NaN and then set to 0.
# for c in [
#     "TotalInterlocks", "BoardSize", "num_grants", "total_funding",
#     "degree_centrality", "weighted_degree", "betweenness_centrality", "clustering_coeff"
# ]:
#     df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

# # log1p (log(1 + x)) keeps zero-funding rows finite.
# df["log_funding"] = np.log1p(df["total_funding"])

# # =======================================
# # 2. Basic correlations
# # =======================================
# print("Spearman correlations:")
# corr = df[[
#     "TotalInterlocks", "BoardSize", "degree_centrality", "weighted_degree",
#     "betweenness_centrality", "clustering_coeff", "num_grants", "log_funding"
# ]].corr(method="spearman")
# print(corr, "\n")

# sns.heatmap(corr, annot=True, cmap="coolwarm")
# plt.title("Spearman correlation matrix (with network metrics)")
# plt.show()

# # =======================================
# # 3. Simple OLS regression (pooled)
# # =======================================
# # All institution-year rows are pooled into one model; the fit requests the
# # "HC3" covariance type.
# ols_formula = (
#     "log_funding ~ TotalInterlocks + BoardSize + "
#     "degree_centrality + weighted_degree + "
#     "betweenness_centrality + clustering_coeff"
# )
# ols_model = smf.ols(ols_formula, data=df).fit(cov_type="HC3")
# print(ols_model.summary())

# # =======================================
# # 4. Quantile regression (localized effects)
# # =======================================
# # Same right-hand side as the OLS formula, refitted at several quantiles.
# quant_formula = (
#     "log_funding ~ TotalInterlocks + BoardSize + "
#     "degree_centrality + weighted_degree + "
#     "betweenness_centrality + clustering_coeff"
# )
# quant_mod = smf.quantreg(quant_formula, df)

# for q in [0.25, 0.5, 0.75, 0.9]:
#     res = quant_mod.fit(q=q)
#     print(f"\nQuantile {q} results:")
#     print(res.summary())

# # =======================================
# # 5. Visualization: local associations
# # =======================================
# # Scatter plots with a LOWESS smoother (lowess=True) rather than a straight
# # regression line.
# sns.lmplot(
#     x="weighted_degree", y="log_funding",
#     data=df, lowess=True, scatter_kws={"alpha":0.3}, line_kws={"color":"red"}
# )
# plt.title("Local relationship: Weighted Degree vs log(Funding)")
# plt.show()

# sns.lmplot(
#     x="betweenness_centrality", y="log_funding",
#     data=df, lowess=True, scatter_kws={"alpha":0.3}, line_kws={"color":"blue"}
# )
# plt.title("Local relationship: Betweenness Centrality vs log(Funding)")
# plt.show()


In [8]:
# # NOTE(review): every line of this cell is commented out, so nothing here runs.
# # Re-enabling it needs networkx, esda and libpysal, which the notebook's
# # active import cell does not load.
# import networkx as nx
# import numpy as np
# import pandas as pd
# from networkx.algorithms import community
# from esda.moran import Moran
# from libpysal.weights import WSP, full2W

# print("=== Local Structure Tests ===\n")

# # One summary dict per processed year; turned into results_df at the end.
# results = []

# for yr, sub_edges in yearly_edges_df.groupby("Year"):
#     # Build weighted graph for this year
#     G = nx.Graph()
#     for _, r in sub_edges.iterrows():
#         G.add_edge(r["AffiliationId_1"], r["AffiliationId_2"], weight=r["WeightNorm"])

#     # Skip empty years
#     # (in practice: any year whose graph has fewer than 3 nodes)
#     if G.number_of_nodes() < 3:
#         continue

#     # ---------------------------
#     # 1. Community detection
#     # ---------------------------
#     # NOTE(review): edges carry their value under the attribute "weight" (see
#     # add_edge above), but weight="WeightNorm" is requested here. networkx
#     # presumably treats a missing attribute as weight 1, so the communities
#     # would be found on the unweighted graph — confirm.
#     comms = community.greedy_modularity_communities(G, weight="WeightNorm")
#     node_to_comm = {n: i for i, c in enumerate(comms) for n in c}

#     # ---------------------------
#     # 2. Neighbor funding stats
#     # ---------------------------
#     # NOTE(review): merged_df only holds years >= 2010 (interlocks_df is
#     # filtered when it is loaded), while the printed head of yearly_edges_df
#     # starts at 1999. For those earlier years fund_map is empty, so every
#     # fund_map.get(..., 0) below returns 0.
#     fund_map = merged_df[merged_df["Year"] == str(yr)].set_index("AffiliationId")["total_funding"].to_dict()
#     neighbor_stats = []
#     for n in G.nodes():
#         neighbor_funds = [fund_map.get(v, 0) for v in G.neighbors(n)]
#         mean_neigh_fund = np.mean(neighbor_funds) if neighbor_funds else 0
#         neighbor_stats.append((n, mean_neigh_fund))

#     # NOTE(review): neigh_df is rebuilt on every iteration and never stored or
#     # appended to results, so only the last year's frame survives the loop.
#     neigh_df = pd.DataFrame(neighbor_stats, columns=["AffiliationId", "mean_neighbor_funding"])
#     neigh_df["Year"] = str(yr)
#     neigh_df["community_id"] = neigh_df["AffiliationId"].map(node_to_comm)

#     # ---------------------------
#     # 3. Moran’s I test
#     # ---------------------------
#     # Dense symmetric adjacency matrix, rows/columns ordered as in `nodes`.
#     nodes = list(G.nodes())
#     node_idx = {n: i for i, n in enumerate(nodes)}
#     W_mat = np.zeros((len(nodes), len(nodes)))
#     for u, v, d in G.edges(data=True):
#         i, j = node_idx[u], node_idx[v]
#         W_mat[i, j] = W_mat[j, i] = d.get("weight", 1.0)

#     # Convert adjacency to PySAL weight object
#     # NOTE(review): the except branch hides whatever made full2W fail, and
#     # W_mat is a dense numpy array at this point; WSP presumably expects a
#     # scipy sparse matrix, so the fallback may itself raise — verify.
#     try:
#         W_obj = full2W(W_mat)
#     except Exception:
#         # fallback to sparse representation
#         W_obj = WSP(W_mat)

#     # Funding per node in the same order as the rows of W_mat (0 if unknown).
#     funding_vec = np.array([fund_map.get(n, 0) for n in nodes])

#     moran = Moran(funding_vec, W_obj)
#     results.append({
#         "Year": yr,
#         "NumNodes": G.number_of_nodes(),
#         "NumEdges": G.number_of_edges(),
#         "Communities": len(comms),
#         "Moran_I": moran.I,
#         "Moran_p": moran.p_sim
#     })

#     print(f"Year {yr}:")
#     print(f"  Nodes={G.number_of_nodes()}  Edges={G.number_of_edges()}  Communities={len(comms)}")
#     print(f"  Moran’s I = {moran.I:.3f}  (p = {moran.p_sim:.4f})")
#     print("  Example communities:", [len(c) for c in comms[:5]], "\n")

# results_df = pd.DataFrame(results)
# print("\n=== Summary of Moran’s I by year ===")
# print(results_df)
