In [250]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import re
import os
import numpy as np
import matplotlib.pyplot as plt

In [251]:
absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
final_scripts = "final_scripts\\"
regression = "regression\\"
normalized_dataframes = "normalized_dataframes\\"
normalized_regression_boards = "normalized_regression_boards\\"
network = "network\\"
network_boards = "network_boards\\"


college_matching = "college_matching\\"

altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{final_scripts}{regression}{normalized_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"
carnegie_map = pd.read_csv(f"{college_matching_path}carnegie_map_openalex.csv")
classification_map = pd.read_csv(f"{college_matching_path}cc_download.csv")
state_path = f"{temporary_data_path}state_systems_validated.csv"   
billionaires_path = f"{temporary_data_path}billionaires_1997_2015.csv"

In [252]:
#time invariant dfs
affiliation_df = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_systems.csv")
original_affiliation_df  = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_original.csv")

#create a dictionary mapping state system -> institutions within the system
affiliation_dict = affiliation_df.groupby("StateSystem")["AffiliationId"].apply(list).to_dict()

affiliation_inverted = {
    aff_id: state_system
    for state_system, aff_ids in affiliation_dict.items()
    for aff_id in aff_ids
}

matched_df = original_affiliation_df[original_affiliation_df["FullName"].isin(affiliation_dict.keys())].copy()

matched_df["StateSystem"] = matched_df["FullName"]

mapping_df = matched_df[["StateSystem", "AffiliationId"]]

mapping_df = mapping_df.drop_duplicates()

system_id_map = {}
nan_id_map = {}

for _, row in mapping_df.iterrows():
    state_system = row["StateSystem"]
    board_aff_id = row["AffiliationId"]
    if pd.isna(board_aff_id):
        nan_id_map[state_system] = affiliation_dict.get(state_system, [])
    else:
        system_id_map[board_aff_id] = affiliation_dict.get(state_system, [])

# Display the resulting dictionary
print(system_id_map)
print(nan_id_map)

# Convert system_id_map values to a list of sets.
# Each value is first cast to int then to str.
system_lists = []
for vals in system_id_map.values():
    sys_set = set(str(int(float(x))).strip() for x in vals if pd.notna(x))
    if sys_set and sys_set not in system_lists:
        system_lists.append(sys_set)
print(system_lists)

print(mapping_df.head)

{127339247.0: [184813773, 67328108, 67328108, 142934699, 142934699, 142934699, 59897056, 59897056, 43369023, 43369023, 26538001, 71838634], 174216632.0: [125687163, 125687163, 125687163], 4210131357.0: [92446798, 92446798, 92446798, 92446798], 4210165361.0: [120156002, 106969075, 155093810], 2801365651.0: [189590672, 146416000, 39587148], 4210141039.0: [61937129, 99041443, 368840534, 44854399], 4210127926.0: [57328836, 24571045, 24571045], 29957033.0: [200885203, 161171246, 161171246], 1327163397.0: [392282, 392282, 392282, 392282, 123946342, 123946342, 123946342, 123946342, 63190737, 63190737, 63190737, 63190737, 59553526, 59553526, 59553526, 59553526, 59553526], 2801649442.0: [8248082, 63772739, 19700959, 106165777, 33213144, 11874761, 2613432], 173268674.0: [75063564, 91045830, 91045830, 91045830, 206651237, 206651237, 96749437, 96749437, 164185940, 164185940, 181414168, 181414168], 2801273398.0: [191429286, 13511017, 13511017, 13511017, 13511017], 4210088475.0: [12315562, 926076166

In [253]:
all_stats = []

for year in years:
    # Construct file paths
    nodes_path = os.path.join(absolute_path, f"{final_scripts}{network}network_interlocks\\{year}_network_interlock_nodes.csv")
    edges_path = os.path.join(absolute_path, f"{final_scripts}{network}network_interlocks\\{year}_network_interlock_edges.csv")
    
    # Read data
    node_df = pd.read_csv(nodes_path)
    edge_df = pd.read_csv(edges_path)
    
    # Create graph from edge list
    G = nx.from_pandas_edgelist(edge_df, source="Source", target="Target", edge_attr="Weight", create_using=nx.Graph())
    
    # Calculate network statistics
    eigen = nx.eigenvector_centrality(G, weight="Weight", max_iter=5000, tol=1e-06)
    betweenness = nx.betweenness_centrality(G, weight="Weight")
    degree_dict = dict(G.degree())
    strength_dict = dict(G.degree(weight="Weight"))
    
    # Map computed statistics back to the node dataframe based on "Id"
    node_df["eigenvector"] = node_df["Id"].map(eigen)
    node_df["betweenness"] = node_df["Id"].map(betweenness)
    node_df["degree"] = node_df["Id"].map(degree_dict)
    node_df["strength"] = node_df["Id"].map(strength_dict)
    
    # Add the current year column
    node_df["Year"] = year
    
    # Select only the desired columns: Id, Institution, network stats, and year
    stats_aff_df = node_df[["Id", "AffiliationId", "eigenvector", "betweenness", "degree", "strength", "Year"]].copy()
    
    # Append the dataframe for this year to our list
    all_stats.append(stats_aff_df)

# Concatenate all yearly stats into one master dataframe
master_df = pd.concat(all_stats, ignore_index=True)

# Save the master dataframe to a CSV file
master_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{network}master_network_stats.csv"), index=False)

In [254]:
combined_systems = {}

# Process numeric keys:
for key, school_ids in system_id_map.items():
    combined_systems[str(key)] = [str(sid) for sid in school_ids]

# Process string keys:
for key, school_ids in nan_id_map.items():
    combined_systems[key] = [str(sid) for sid in school_ids]

print(combined_systems)
expanded_rows = []

# Iterate over each row in master_df.
for idx, row in master_df.iterrows():
    aff_id_str = str(float(row["AffiliationId"]))
    if aff_id_str in combined_systems:
        # Duplicate the row for each affiliated school in the list.
        for new_aff in combined_systems[aff_id_str]:
            new_row = row.copy()
            new_row["AffiliationId"] = new_aff  # Replace with the specific school ID.
            expanded_rows.append(new_row)
    else:
        # If not a match, just add the row as is.
        expanded_rows.append(row)

for idx, row in master_df.iterrows():
    # Convert the Institution value to a string.
    institution_str = str(row["Id"])
    
    # Check if this institution is a key in combined_systems.
    if institution_str in combined_systems:
        # Duplicate the row for each affiliated school in the list.
        for new_aff in combined_systems[institution_str]:
            new_row = row.copy()
            new_row["AffiliationId"] = new_aff  # Replace with the specific school ID.
            expanded_rows.append(new_row)
    else:
        # If not a match, just add the row as is.
        expanded_rows.append(row)

# Create a new DataFrame from the expanded rows.
expanded_master_df = pd.DataFrame(expanded_rows)
expanded_master_df.sort_values(by=["Year", "Id"], inplace=True)
expanded_master_df.drop_duplicates(inplace=True)
expanded_master_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{network}master_network_stats.csv"), index=False)
# Optionally, view the result.


{'127339247.0': ['184813773', '67328108', '67328108', '142934699', '142934699', '142934699', '59897056', '59897056', '43369023', '43369023', '26538001', '71838634'], '174216632.0': ['125687163', '125687163', '125687163'], '4210131357.0': ['92446798', '92446798', '92446798', '92446798'], '4210165361.0': ['120156002', '106969075', '155093810'], '2801365651.0': ['189590672', '146416000', '39587148'], '4210141039.0': ['61937129', '99041443', '368840534', '44854399'], '4210127926.0': ['57328836', '24571045', '24571045'], '29957033.0': ['200885203', '161171246', '161171246'], '1327163397.0': ['392282', '392282', '392282', '392282', '123946342', '123946342', '123946342', '123946342', '63190737', '63190737', '63190737', '63190737', '59553526', '59553526', '59553526', '59553526', '59553526'], '2801649442.0': ['8248082', '63772739', '19700959', '106165777', '33213144', '11874761', '2613432'], '173268674.0': ['75063564', '91045830', '91045830', '91045830', '206651237', '206651237', '96749437', '9

In [255]:
expanded_master_df.rename(columns={"Id": "Institution"}, inplace=True)
original_affiliation_df.rename(columns={"FullName": "Institution"}, inplace = True)

expanded_master_df["AffiliationId"] = expanded_master_df["AffiliationId"].astype(float)

expanded_master_df = expanded_master_df.merge(
    original_affiliation_df[['AffiliationId', 'Institution']], 
    on='AffiliationId', 
    how='left', 
    suffixes=('', '_new')
)

# Overwrite the Institution column in expanded_master_df with the matched values
expanded_master_df['Institution'] = expanded_master_df['Institution_new']

# Drop the temporary column used for merging
expanded_master_df.drop(columns=['Institution_new'], inplace=True)

expanded_master_df.sort_values(by=["Year", "Institution"], inplace=True)
expanded_master_df.drop_duplicates(inplace=True)
expanded_master_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{network}master_network_stats.csv"), index=False)
# Optionally, view the result.


In [256]:
import pandas as pd

# Load the data
university_boards_statistics_path = f"{absolute_path}{final_scripts}{regression}regression_stats\\regression_university_board_statistics.csv"
board_statistics_df = pd.read_csv(university_boards_statistics_path)

# Ensure correct data types for merging
board_statistics_df["Year"] = board_statistics_df["Year"].astype(int)
board_statistics_df["AffiliationId"] = board_statistics_df["AffiliationId"].astype(float)

expanded_master_df["Year"] = expanded_master_df["Year"].astype(int)
expanded_master_df["AffiliationId"] = expanded_master_df["AffiliationId"].astype(float)

# Select relevant columns from expanded_master_df
columns_to_merge = ["Year", "AffiliationId", "eigenvector", "betweenness", "degree", "strength"]
expanded_subset = expanded_master_df[columns_to_merge]

# Merge based on "Year" and "AffiliationId"
board_statistics_df = board_statistics_df.merge(
    expanded_subset,
    on=["Year", "AffiliationId"],
    how="left",
    suffixes=("_old", "")  # Ensures new values do not get "_y" suffix
)

# Drop old columns (eigenvector_x, betweenness_x, degree_x, strength_x)
board_statistics_df.drop(columns=["eigenvector_old", "betweenness_old", "degree_old", "strength_old"], errors="ignore", inplace=True)

# Fill NaN values in the newly merged columns
board_statistics_df[["eigenvector", "betweenness", "degree", "strength"]] = board_statistics_df[["eigenvector", "betweenness", "degree", "strength"]].fillna(0)

# Save the updated DataFrame
board_statistics_df.to_csv(university_boards_statistics_path, index=False)
