In [492]:
import pandas as pd
from collections import defaultdict, Counter
import networkx as nx
import os
import re
import matplotlib.pyplot as plt
from rapidfuzz import fuzz
from itertools import combinations

In [493]:
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"  
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
yearly_interlocks = "yearly_interlocks\\"
final_scripts = "final_scripts\\"
normalized_dataframes = "normalized_dataframes\\"


years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [494]:
def remove_non_samples(df: pd.DataFrame) -> pd.DataFrame:
    """Filter the DataFrame to include only rows where 'PrimarySample' is True."""
    return df[df['PrimarySample'] == True]

def normalize(x):
    """Convert the input to a string and strip whitespace."""
    return str(x).strip()
    


In [495]:
#year invariant dataframes
affiliation_path = os.path.join(absolute_path, final_scripts, 'normalized_dataframes\\affiliation_altered.csv')
state_system_path = os.path.join(absolute_path, final_scripts, 'normalized_dataframes\\state_systems_validated.csv')

affiliation_df = pd.read_csv(affiliation_path)
state_systems_df = pd.read_csv(state_system_path)

affiliation_df["AffiliationId"] = affiliation_df["AffiliationId"].astype(str)
affiliation_df = affiliation_df.drop_duplicates(subset=['AffiliationId'])

state_systems_df["AffiliationId"] = state_systems_df["AffiliationId"].astype(str)
state_systems_df = state_systems_df.drop_duplicates(subset=['AffiliationId'])

In [496]:
for year in years:
    board_path = os.path.join(absolute_path, board_dataframes, f"{year}_boards.csv")
    double_board_path = os.path.join(absolute_path, board_dataframes, f"{year}_double_board.csv")
    
    # Read the board data
    board_df = pd.read_csv(board_path)
    double_board_df = pd.read_csv(double_board_path)
    # board_df = remove_non_samples(board_df)
    # double_board_df = remove_non_samples(double_board_df)

    board_df["AffiliationId"] = board_df["AffiliationId"].astype(str)
    double_board_df["AffiliationId"] = double_board_df["AffiliationId"].astype(str)
    double_board_df['StateSystem'] = ''

    board_df['AffiliationId'] = board_df['AffiliationId'].fillna('0')
    double_board_df['AffiliationId'] = double_board_df['AffiliationId'].fillna('0')

    # Merge affiliation_df's PrimarySample into board_df.
    # Use suffixes so that board_df's original column is preserved.
    merged_df = board_df.merge(
        affiliation_df[['AffiliationId', 'PrimarySample']],
        on='AffiliationId',
        how='left',
        suffixes=('_board', '')
    )
    merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])
    merged_df.drop('PrimarySample_board', axis=1, inplace=True)

    board_df = merged_df
    board_df = board_df.drop_duplicates()

    merged_df = board_df.merge(
        state_systems_df[['AffiliationId', 'StateSystem']],
        on='AffiliationId',
        how='left',
        suffixes=('_board', '')
    )
    merged_df['StateSystem'] = merged_df['StateSystem'].combine_first(merged_df['StateSystem_board'])
    merged_df.drop('StateSystem_board', axis=1, inplace=True)

    board_df = merged_df
    board_df = board_df.drop_duplicates()

    
    

    merged_df = double_board_df.merge(
        affiliation_df[['AffiliationId', 'PrimarySample']],
        on='AffiliationId',
        how='left',
        suffixes=('_board', '')
    )
    merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])
    merged_df.drop('PrimarySample_board', axis=1, inplace=True)

    double_board_df = merged_df
    double_board_df = double_board_df.drop_duplicates()

    merged_df = double_board_df.merge(
        state_systems_df[['AffiliationId', 'StateSystem']],
        on='AffiliationId',
        how='left',
        suffixes=('_board', '')
    )
    merged_df['StateSystem'] = merged_df['StateSystem'].combine_first(merged_df['StateSystem_board'])
    merged_df.drop('StateSystem_board', axis=1, inplace=True)

    double_board_df = double_board_df
    double_board_df = double_board_df.drop_duplicates()
    
    board_df = remove_non_samples(board_df)
    double_board_df = remove_non_samples(double_board_df)
    board_df.to_csv(os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv"), index=False)
    double_board_df.to_csv(os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_double_boards_normalized.csv"), index=False)


  merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])
  merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])
  merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])
  merged_df['PrimarySample'] = merged_df['PrimarySample'].combine_first(merged_df['PrimarySample_board'])


In [497]:
affiliation_name_counter = defaultdict(Counter)  # Maps AffiliationId -> Counter of {InstitutionName: count}
years_data = {}  

for year in years:
    file_path = os.path.join(
        absolute_path, 
        final_scripts, 
        normalized_dataframes, 
        f"{year}_boards_normalized.csv"
    )
    
    board_df = pd.read_csv(file_path)
    board_df = remove_non_samples(board_df)
    
    # Store the DataFrame in a dict so we don't need to re-read later
    years_data[year] = board_df
    
    # Count the number of times that an affiliation id and institution shows up
    for aff_id, institution_name in zip(board_df["AffiliationId"], board_df["Institution"]):
        affiliation_name_counter[aff_id][institution_name] += 1

affiliation_final_name = {}
# Get the most commonly appearing institution name for each affiliation id
for aff_id, name_counter in affiliation_name_counter.items():
    most_common_name = name_counter.most_common(1)[0][0]
    affiliation_final_name[aff_id] = most_common_name

print("Overall affiliation mapping:")
print(affiliation_final_name)

# Remove any keys that are NaN
affiliation_final_name = {k: v for k, v in affiliation_final_name.items() if pd.notna(k)}

# Update the dataframes with the standardized names we made
for year in years:
    board_df = years_data[year]
    # Make a copy of the original Institution names for printing purposes.
    original_names = board_df["Institution"].copy()
    
    board_df["Institution"] = board_df["AffiliationId"].map(affiliation_final_name)
    
    # For each unique affiliation id, print the group and name change only if a change occurred.
    for aff in board_df["AffiliationId"].unique():
        group_old_names = original_names[board_df["AffiliationId"] == aff].unique()
        new_name = affiliation_final_name.get(aff, None)
        # Only print if the change actually occurs (i.e. not all original names equal the new name)
        if set(group_old_names) != {new_name}:
            print(f"Year {year}: AffiliationId '{aff}' had original names {group_old_names} -> changed to '{new_name}'")
    
    output_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    board_df.to_csv(output_path, index=False)

    print(f"Year {year} updated and saved to: {output_path}")


Overall affiliation mapping:
{71965598.0: 'Adelphi University', 181401687.0: 'American University', 102298084.0: 'Andrews University', 55732556.0: 'Arizona State University', 82497590.0: 'Auburn University', 198089087.0: 'Ball State University', 157394403.0: 'Baylor University', 152479009.0: 'Biola University', 103531236.0: 'Boston College', 111088046.0: 'Boston University', 157417397.0: 'Bowling Green State University', 6902469.0: 'Brandeis University', 100005738.0: 'Brigham Young University', 27804330.0: 'Brown University', 122411786.0: 'California Institute Of Technology', 74973139.0: 'Carnegie Mellon University', 58956616.0: 'Case Western Reserve University', 84470341.0: 'Catholic University Of America', 1629065.0: 'Central Michigan University', 185071736.0: 'Chapman University', 174216632.0: 'City University Of New York', 52064589.0: 'Clark Atlanta University', 130785548.0: 'Clark University', 16944753.0: 'Clarkson University', 8078737.0: 'Clemson University', 102607778.0: 'Clevel

In [498]:
#determine how much institution's boards overlap
def group_institutions_by_membership(institution_to_members, threshold):
    """
    Given a dict: {institution -> set_of_members}, return a list of groups
    (lists) of institutions that have >= `threshold` overlap in membership.

    overlap = (size of intersection) / (size of smaller board)
    """
    institutions = list(institution_to_members.keys())
    n = len(institutions)

    # adjacency list for institutions with >= threshold overlap
    adjacency = defaultdict(list)
    for i in range(n):
        for j in range(i + 1, n):
            inst_i, inst_j = institutions[i], institutions[j]
            members_i = institution_to_members[inst_i]
            members_j = institution_to_members[inst_j]
            size_i = len(members_i)
            size_j = len(members_j)

            if size_i == 0 or size_j == 0:
                continue
            
            intersection_size = len(members_i.intersection(members_j))
            smaller_board_size = min(size_i, size_j)
            if smaller_board_size == 0:
                continue
            
            overlap_ratio = intersection_size / smaller_board_size
            if overlap_ratio >= threshold:
                adjacency[inst_i].append(inst_j)
                adjacency[inst_j].append(inst_i)

    #Find connected components via DFS
    visited = set()
    groups = []
    for inst in institutions:
        if inst not in visited:
            stack = [inst]
            group = []
            while stack:
                current = stack.pop()
                if current not in visited:
                    visited.add(current)
                    group.append(current)
                    for neighbor in adjacency[current]:
                        if neighbor not in visited:
                            stack.append(neighbor)
            groups.append(sorted(group))

    #Keep only groups of size >= 2 (meaning actual merges)
    return [g for g in groups if len(g) > 1]

THRESHOLD = 0.7
#want all years of the data in a single var
all_data = {}
for year in years:
    file_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    df = pd.read_csv(file_path)
    df = remove_non_samples(df)
    all_data[year] = df

#map institution to board members
institution_to_members = defaultdict(set)

#map instituition to state system
institution_to_statesystem = defaultdict(set)

for df in all_data.values():
    for _, row in df.iterrows():
        inst_name = row["Institution"]
        member_id = row["Name"] 
        statesys_val = row["StateSystem"]

        institution_to_members[inst_name].add(member_id)
        if pd.notnull(statesys_val) and statesys_val.strip():
            institution_to_statesystem[inst_name].add(statesys_val.strip())

#determine which groups are categorized as identical
groups = group_institutions_by_membership(institution_to_members, THRESHOLD)

#assign the identical boards the name of their state system
institution_to_canonical_name = {}

for group in groups:
    possible_names = set()
    for inst in group:
        if institution_to_statesystem[inst]:
            possible_names.update(institution_to_statesystem[inst])
    #if one of the schools in the board group is a state system (which it will be), use that one for the name,
    #else just use the original inst name
    if len(possible_names) > 0:
        chosen_name = sorted(possible_names)[0]
    else:
        chosen_name = group[0]

    #map institutions in that group to the new name (state system name)
    for inst in group:
        institution_to_canonical_name[inst] = chosen_name

#now in each individual year, rename these boards
for year, df in all_data.items():
    # Apply the canonical naming (merge the boards)
    df["Institution"] = df["Institution"].apply(lambda x: institution_to_canonical_name.get(x, x))
    
    # --- NEW PRINT BLOCK ---
    print(f"\nIdentical board groups for Year {year}:")
    # Get the set of institutions present in this year's dataframe.
    present_institutions = set(df["Institution"].unique())
    # For each group computed from all data, check which members are present in this year.
    for group in groups:
        present = [inst for inst in group if inst in present_institutions]
        # Only print if more than one institution from the group is present (i.e. a merge will occur)
        if len(set(present)) > 1:
            canonical_name = institution_to_canonical_name[group[0]]
            print(f"Group: {group} (present: {present}) -> Merged into: {canonical_name}")
    # --- END NEW PRINT BLOCK ---

    # Instead of drop_duplicates, group by institution and select the largest board,
    # then remove all rows of the other boards.
    def keep_largest_board(group):
        # Group by board using the AffiliationId column.
        # (If an institution only appears with one board, this will just return the group as-is.)
        boards = group.groupby("AffiliationId", dropna=False)
        if len(boards) <= 1:
            return group
        # Determine which board has the most members.
        board_counts = boards.size()
        best_aff_id = board_counts.idxmax()
        # Return only the rows corresponding to the largest board.
        return group[group["AffiliationId"] == best_aff_id]
    
    df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)
    
    out_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    df.to_csv(out_path, index=False)
    print(f"Year {year} merged & saved: {out_path}")



Identical board groups for Year 1999:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 1999 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\1999_boards_normalized.csv

Identical board groups for Year 2000:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2000 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2000_boards_normalized.csv

Identical board groups for Year 2005:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2005 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2005_boards_normalized.csv

Identical board groups for Year 2007:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2007 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2007_boards_normalized.csv

Identical board groups for Year 2008:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2008 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2008_boards_normalized.csv

Identical board groups for Year 2009:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2009 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2009_boards_normalized.csv

Identical board groups for Year 2011:
Year 2011 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2011_boards_normalized.csv

Identical board groups for Year 2013:


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)
  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


Year 2013 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2013_boards_normalized.csv

Identical board groups for Year 2018:
Year 2018 merged & saved: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\normalized_dataframes\2018_boards_normalized.csv


  df = df.groupby("Institution", group_keys=False).apply(keep_largest_board).reset_index(drop=True)


In [499]:
'''
Determine what institutions report their board every year:
'''

institution_info = {}

for year in years:
    file_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    board_df = pd.read_csv(file_path)
    board_df = remove_non_samples(board_df)
    
    for _, row in board_df.iterrows():
        institution = row.get("Institution")
        affiliation_id = row.get("AffiliationId")
        
        if pd.isna(institution) or normalize(institution) == "":
            continue
        
        #Initialize entry if not already present.
        if institution not in institution_info:
            institution_info[institution] = {"years": set(), "affiliation_ids": set()}
        
        #Record the year in which the institution appears.
        institution_info[institution]["years"].add(year)
        
        #If there is an affiliation id (and it's not empty after normalization), record it.
        if pd.notna(affiliation_id) and normalize(affiliation_id) != "":
            # Ensure the affiliation id is a string.
            affiliation_id = str(affiliation_id).strip()
            institution_info[institution]["affiliation_ids"].add(affiliation_id)

#Create a set of all years (to compare with each institution's set of years)
all_years = set(years)
institutions_every_year = {inst: info for inst, info in institution_info.items() if info["years"] == all_years}
institutions_certain_years = {inst: info for inst, info in institution_info.items() if info["years"] != all_years}


print("Institutions that appear in every year:")
for inst in sorted(institutions_every_year):
    info = institutions_every_year[inst]
    # Sort affiliation ids for readability
    aff_ids = sorted(info["affiliation_ids"])
    print(f"{inst}: Affiliation IDs: {aff_ids}")

print("\nInstitutions that appear in only certain years:")
for inst, info in institutions_certain_years.items():
    aff_ids = sorted(info["affiliation_ids"])
    print(f"{inst}: appears in years {sorted(info['years'])}; Affiliation IDs: {aff_ids}")


# Now, throw away the schools that don't report their board every year from all the dataframes.
for year in years:
    file_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    board_df = pd.read_csv(file_path)
    board_df = remove_non_samples(board_df)
    
    # Keep only rows where the Institution appears in every year.
    # board_df = board_df[board_df["Institution"].isin(institutions_every_year.keys())]
    
    # Save the updated dataframe back to the file.
    board_df.to_csv(file_path, index=False)
    print(f"Year {year}: kept only institutions that report every year.")

Institutions that appear in every year:
Adelphi University: Affiliation IDs: ['71965598.0']
American University: Affiliation IDs: ['181401687.0']
Andrews University: Affiliation IDs: ['102298084.0']
Arizona Board of Regents: Affiliation IDs: ['138006243.0', '203172682.0', '55732556.0']
Auburn University: Affiliation IDs: ['82497590.0']
Ball State University: Affiliation IDs: ['198089087.0']
Baylor University: Affiliation IDs: ['157394403.0']
Biola University: Affiliation IDs: ['152479009.0']
Boston College: Affiliation IDs: ['103531236.0']
Boston University: Affiliation IDs: ['111088046.0']
Brandeis University: Affiliation IDs: ['6902469.0']
Brown University: Affiliation IDs: ['27804330.0']
Carnegie Mellon University: Affiliation IDs: ['74973139.0']
Case Western Reserve University: Affiliation IDs: ['58956616.0']
Catholic University Of America: Affiliation IDs: ['84470341.0']
Central Michigan University: Affiliation IDs: ['1629065.0']
Chapman University: Affiliation IDs: ['185071736.0'

In [None]:
# For each year, open the normalized board file as you normally do.
for year in years:
    file_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    board_df = pd.read_csv(file_path)
    board_df = remove_non_samples(board_df)
    print(f"Processing interlocks for year: {year}")
    
    # Initialize a dictionary to track which institutions each board member serves on.
    # (The keys are cleaned board member names and the values are sets of institution names.)
    board_member_dict = defaultdict(set)
    
    # Initialize a dictionary for accumulating interlock edges.
    # Keys: tuple(sorted([inst1, inst2]))
    # Values: a dictionary with edge attributes.
    edge_accum = {}
    
    # Initialize a dictionary for node (institution) attributes.
    # Each institution will record its total interlock count and its AffiliationId.
    year_nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'AffiliationId': None})
    
    # Process each row in the board DataFrame.
    for _, row in board_df.iterrows():
        name_raw = row["Name"]
        # Skip if the board member is marked as "vacant".
        if "vacant" in str(name_raw).lower():
            continue
        
        # Clean the board member's name.
        name = clean_name(name_raw)
        institution = row["Institution"]
        affiliation_id = row["AffiliationId"]
        
        # For each institution this board member has already been recorded on,
        # record an interlock between that institution and the current one.
        for prev_institution in board_member_dict[name]:
            if prev_institution == institution:
                continue
            pair = tuple(sorted([prev_institution, institution]))
            # Increase the weight (shared board member count) by 1.
            if pair in edge_accum:
                edge_accum[pair]['Weight'] += 1
            else:
                edge_id = f"e_{year}_{len(edge_accum)+1}"
                edge_accum[pair] = {
                    'Id': edge_id,
                    'Source': pair[0],
                    'Target': pair[1],
                    'Type': 'Undirected',
                    'Weight': 1,
                    'Year': year
                }
            # Update the interlock count for both institutions.
            year_nodes_dict[prev_institution]['Interlock_Count'] += 1
            year_nodes_dict[institution]['Interlock_Count'] += 1
        
        # Add the current institution to this board member's record.
        board_member_dict[name].add(institution)
        
        # If we haven't yet recorded an AffiliationId for this institution, do so.
        if year_nodes_dict[institution]['AffiliationId'] is None and pd.notnull(affiliation_id):
            year_nodes_dict[institution]['AffiliationId'] = affiliation_id
    
    # ---------------------------
    # Create Nodes and Edges DataFrames
    # ---------------------------
    nodes_data = []
    for inst, data in year_nodes_dict.items():
        nodes_data.append({
            'Id': inst,
            'Label': inst,
            'Interlock_Count': data['Interlock_Count'],
            'AffiliationId': data['AffiliationId']
        })
    nodes_df = pd.DataFrame(nodes_data)
    
    if edge_accum:
        edges_df = pd.DataFrame(list(edge_accum.values()))
        # Reorder columns if desired.
        edges_df = edges_df[['Id', 'Source', 'Target', 'Type', 'Weight', 'Year']]
    else:
        edges_df = pd.DataFrame(columns=['Id', 'Source', 'Target', 'Type', 'Weight', 'Year'])
    
    # ---------------------------
    # Write the Nodes and Edges DataFrames to CSV
    # ---------------------------
    out_nodes_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"interlocks\\{year}_interlocks_nodes.csv")
    out_edges_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"interlocks\\{year}_interlocks_edges.csv")
    
    nodes_df.to_csv(out_nodes_path, index=False)
    edges_df.to_csv(out_edges_path, index=False)
    
    print(f"Year {year} interlock nodes saved to: {out_nodes_path}")
    print(f"Year {year} interlock edges saved to: {out_edges_path}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\final_scripts\\normalized_dataframes\\1999_boards_normalized.csv'