In [6]:
import pandas as pd
from collections import defaultdict, Counter
import networkx as nx
import os
import re
import matplotlib.pyplot as plt
from rapidfuzz import fuzz
from itertools import combinations

In [7]:
# Root folder of the board_analysis project.  It defaults to the original
# author's local checkout; set the BOARD_ANALYSIS_DIR environment variable to
# run the notebook from another location without editing this cell.
absolute_path = os.environ.get("BOARD_ANALYSIS_DIR") or (
    "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
)
# Later cells build file names by plain string concatenation, so the root must
# always end with a path separator.
if not absolute_path.endswith(("\\", "/")):
    absolute_path += os.sep

# Sub-folder names.  Each one keeps its trailing separator because the cells
# below concatenate them directly (e.g. f"{absolute_path}{final_scripts}{regression}...").
altered_dataframes = "altered_dataframes\\"
board_dataframes = "board_dataframes\\"
final_scripts = "final_scripts\\"
regression = "regression\\"
normalized_dataframes = "normalized_dataframes\\"
normalized_regression_boards = "normalized_regression_boards\\"


# Years that are processed; kept as strings because they are interpolated into file names.
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2010", "2011", "2013", "2018"]

In [8]:
def remove_non_samples(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose 'PrimarySample' value equals True.

    Args:
        df: Frame that contains a 'PrimarySample' column.

    Returns:
        A new, independent DataFrame holding only the matching rows (original
        index labels are kept).  Because it is an explicit copy, callers can
        assign columns on the result without pandas chained-assignment
        warnings and without any risk of touching ``df``.

    Raises:
        KeyError: if ``df`` has no 'PrimarySample' column.
    """
    # Element-wise comparison on purpose: missing values compare as False and
    # are therefore dropped as well.
    is_primary = df['PrimarySample'] == True  # noqa: E712
    return df[is_primary].copy()

def normalize(x):
    """Render ``x`` as text and drop any leading/trailing whitespace."""
    as_text = str(x)
    trimmed = as_text.strip()
    return trimmed
    


In [9]:
# Read regression_stats\affiliation_systems.csv from the regression folder.
# (A later cell reads the same file into the same name before grouping it by StateSystem.)
affiliation_df = pd.read_csv(
    "".join([absolute_path, final_scripts, regression, "regression_stats\\affiliation_systems.csv"])
)


In [10]:
# AffiliationId -> Counter({InstitutionName: occurrences}), accumulated over the
# board and double-board files of every year (PrimarySample rows only).
affiliation_name_counter = defaultdict(Counter)
years_board_data = {}
years_double_data = {}

for year in years:
    yearly_frames = []
    for file_name in (f"{year}_boards_regression.csv", f"{year}_double_boards_regression.csv"):
        csv_path = os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", file_name)
        yearly_frames.append(remove_non_samples(pd.read_csv(csv_path)))

    # Keep the filtered frames around; the second half of this cell rewrites them.
    years_board_data[year], years_double_data[year] = yearly_frames

    # Board rows are tallied before double-board rows, year by year.
    for frame in yearly_frames:
        for aff_id, institution_name in zip(frame["AffiliationId"], frame["Institution"]):
            affiliation_name_counter[aff_id][institution_name] += 1

# Final mapping: for every non-missing AffiliationId, the institution name that
# was seen most often across both kinds of file.
affiliation_final_name = {
    aff_id: name_counter.most_common(1)[0][0]
    for aff_id, name_counter in affiliation_name_counter.items()
    if pd.notna(aff_id)
}

# Now update both the board and double board DataFrames using the standardized names.
for year in years:
    # (log label, per-year store, output file name) for the two kinds of board file.
    yearly_tables = (
        ("Board", years_board_data, f"{year}_boards_normalized_regression.csv"),
        ("Double Board", years_double_data, f"{year}_double_boards_normalized_regression.csv"),
    )

    for label, store, file_name in yearly_tables:
        source_df = store[year]
        original_names = source_df["Institution"]

        # Map the standardized institution names based on AffiliationId.  Rows whose
        # AffiliationId is missing (or has no mapping) keep their original name
        # instead of being overwritten with NaN.
        standardized_names = source_df["AffiliationId"].map(affiliation_final_name).fillna(original_names)

        # assign() builds a new frame, so the filtered frame held in the store is
        # never modified through chained assignment; the store is then updated so
        # it still exposes the standardized data.
        normalized_df = source_df.assign(Institution=standardized_names)
        store[year] = normalized_df

        # Log changes: for each affiliation id whose original names differ from the
        # standardized name, print the change.  Missing ids are skipped: they can
        # never match a row (NaN != NaN) and have no standardized name.
        for aff in normalized_df["AffiliationId"].dropna().unique():
            group_old_names = original_names[normalized_df["AffiliationId"] == aff].unique()
            new_name = affiliation_final_name.get(aff, None)
            if set(group_old_names) != {new_name}:
                print(f"Year {year}: {label} AffiliationId '{aff}' had original names {group_old_names} -> changed to '{new_name}'")

        out_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{file_name}"
        normalized_df.to_csv(out_path, index=False)


Year 1999: Board AffiliationId '200885203.0' had original names ['Indiana University Of Pennsylvania'] -> changed to 'Indiana University of Pennsylvania'
Year 1999: Board AffiliationId '102179633.0' had original names ['New School University'] -> changed to 'New School'
Year 1999: Board AffiliationId '57206974.0' had original names ['New York University' 'Polytechnic University'] -> changed to 'New York University'
Year 1999: Board AffiliationId '35777872.0' had original names ['North Carolina Agricultural And Technical State University'] -> changed to 'North Carolina Agricultural and Technical State University'
Year 1999: Board AffiliationId '2801502357.0' had original names ['Southern Illinois University'] -> changed to 'Southern Illinois University System'
Year 1999: Board AffiliationId '392282.0' had original names ['State University Of New York At Albany'] -> changed to 'State University of New York at Albany'
Year 1999: Board AffiliationId '63190737.0' had original names ['State 

In [11]:
# Determine which institutions (AffiliationIds) report their board every year.
affiliation_info = {}

for year in years:
    board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_boards_normalized_regression.csv"
    double_board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_double_boards_normalized_regression.csv"
    board_df = remove_non_samples(pd.read_csv(board_path))
    double_board_df = remove_non_samples(pd.read_csv(double_board_path))

    # Board rows first, then double-board rows; both feed the same year set.
    for frame in (board_df, double_board_df):
        for _, row in frame.iterrows():
            raw_id = row.get("AffiliationId")
            # Skip rows without a usable affiliation id.
            if pd.isna(raw_id) or normalize(str(raw_id)) == "":
                continue
            id_key = str(raw_id).strip()
            affiliation_info.setdefault(id_key, {"years": set()})["years"].add(year)

# Split the ids into "present in every year" and "present in only some years".
all_years = set(years)
affiliations_every_year = {}
affiliations_certain_years = {}
for aff, info in affiliation_info.items():
    bucket = affiliations_every_year if info["years"] == all_years else affiliations_certain_years
    bucket[aff] = info

# print("Affiliation IDs that appear in every year:")
# for aff in sorted(affiliations_every_year):
#     info = affiliations_every_year[aff]
#     print(f"{aff}: Years: {sorted(info['years'])}")

# print("\nAffiliation IDs that appear in only certain years:")
# for aff, info in affiliations_certain_years.items():
#     print(f"{aff}: Years: {sorted(info['years'])}")

print(len(affiliations_every_year), len(affiliations_certain_years))


227 66


In [12]:
# The set of ids to keep does not depend on the year, so it is built once
# instead of once per iteration.  Keys are normalized to stripped strings.
# NOTE(review): the keys come from str(...) applied to per-row values in the
# previous cell, while the column below is rendered with astype(str); this
# assumes both render an id identically - confirm for integer-typed columns.
aff_keys = {str(key).strip() for key in affiliations_every_year}

for year in years:
    board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_boards_normalized_regression.csv"
    double_board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_double_boards_normalized_regression.csv"

    # Each file is filtered and then overwritten in place.
    for csv_path in (board_path, double_board_path):
        # .copy(): the AffiliationId column is reassigned below, which must not
        # happen on a frame that pandas still tracks as a slice of another frame.
        frame = remove_non_samples(pd.read_csv(csv_path)).copy()
        frame["AffiliationId"] = frame["AffiliationId"].astype(str).str.strip()
        frame = frame[frame["AffiliationId"].isin(aff_keys)]
        frame.to_csv(csv_path, index=False)

    print(f"Year {year}: kept only affiliation IDs that report every year.")
    print(f"  Boards saved to: {board_path}")
    print(f"  Double Boards saved to: {double_board_path}")

Year 1999: kept only affiliation IDs that report every year.
  Boards saved to: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\regression\normalized_regression_boards\1999_boards_normalized_regression.csv
  Double Boards saved to: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\regression\normalized_regression_boards\1999_double_boards_normalized_regression.csv
Year 2000: kept only affiliation IDs that report every year.
  Boards saved to: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\regression\normalized_regression_boards\2000_boards_normalized_regression.csv
  Double Boards saved to: C:\Users\tykun\OneDrive\Documents\SchoolDocs\VSCodeProjects\connectedData\board_analysis\final_scripts\regression\normalized_regression_boards\2000_double_boards_normalized_regression.csv
Year 2005: kept only affiliation IDs that report e

In [13]:
# Time-invariant lookup tables.
affiliation_df = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_systems.csv")
original_affiliation_df = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_original.csv")

# State system -> list of the AffiliationIds of the institutions within the system.
affiliation_dict = affiliation_df.groupby("StateSystem")["AffiliationId"].apply(list).to_dict()

# Reverse lookup: member AffiliationId -> state system (a later system wins on repeats).
affiliation_inverted = {}
for state_system, aff_ids in affiliation_dict.items():
    affiliation_inverted.update(dict.fromkeys(aff_ids, state_system))

# Rows of the original affiliation table whose FullName is itself a state system.
is_system_row = original_affiliation_df["FullName"].isin(affiliation_dict.keys())
matched_df = original_affiliation_df[is_system_row].copy()
matched_df["StateSystem"] = matched_df["FullName"]

mapping_df = matched_df[["StateSystem", "AffiliationId"]].drop_duplicates()

# Systems that have their own AffiliationId are keyed by that id; systems
# without one are keyed by their name instead.
system_id_map = {}
nan_id_map = {}
for state_system, board_aff_id in zip(mapping_df["StateSystem"], mapping_df["AffiliationId"]):
    member_ids = affiliation_dict.get(state_system, [])
    if pd.notna(board_aff_id):
        system_id_map[board_aff_id] = member_ids
    else:
        nan_id_map[state_system] = member_ids

# Display the resulting dictionaries
print(system_id_map)
print(nan_id_map)

{127339247.0: [184813773, 67328108, 67328108, 142934699, 142934699, 142934699, 59897056, 59897056, 43369023, 43369023, 26538001, 71838634], 174216632.0: [125687163, 125687163, 125687163], 4210131357.0: [92446798, 92446798, 92446798, 92446798], 4210165361.0: [120156002, 106969075, 155093810], 2801365651.0: [189590672, 146416000, 39587148], 4210141039.0: [61937129, 99041443, 368840534, 44854399], 4210127926.0: [57328836, 24571045, 24571045], 29957033.0: [200885203, 161171246, 161171246], 1327163397.0: [392282, 392282, 392282, 392282, 123946342, 123946342, 123946342, 123946342, 63190737, 63190737, 63190737, 63190737, 59553526, 59553526, 59553526, 59553526, 59553526], 2801649442.0: [8248082, 63772739, 19700959, 106165777, 33213144, 11874761, 2613432], 173268674.0: [75063564, 91045830, 91045830, 91045830, 206651237, 206651237, 96749437, 96749437, 164185940, 164185940, 181414168, 181414168], 2801273398.0: [191429286, 13511017, 13511017, 13511017, 13511017], 4210088475.0: [12315562, 926076166

In [14]:
'''
The interlocks counted here are not the interlocks used for the network. They are only used to identify interlocking institutions so that the members' names can be cleaned.
'''

# Titles, honorifics and suffixes that are stripped from board-member names.
substrings_to_remove = [
    "Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very",
    "Sr.", "O.P.", "Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev",
    "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III"
]


def _build_title_pattern(titles):
    """Compile one case-insensitive regex that matches any of ``titles`` as a whole token."""
    alternatives = []
    # Longest first, so that e.g. "Very Rev." is preferred over "Rev." and
    # "S.J." over "S.J" when both could match at the same position.
    unique_titles = sorted({t.strip() for t in titles if t.strip()}, key=lambda t: (-len(t), t))
    for title in unique_titles:
        piece = re.escape(title)
        # "\b" only works next to a word character.  A title that ends in "."
        # is normally followed by a space or the end of the string, where "\b"
        # never matches, so boundary checks are added only on sides that start
        # or end with a word character.
        if re.match(r'\w', title[0]):
            piece = r'(?<!\w)' + piece
        if re.match(r'\w', title[-1]):
            piece = piece + r'(?!\w)'
        alternatives.append(piece)
    return re.compile('|'.join(alternatives), flags=re.IGNORECASE)


# Built once from the list above; clean_name() reuses it for every name.
_TITLE_PATTERN = _build_title_pattern(substrings_to_remove)


def clean_name(raw_name: str) -> str:
    """Normalize a board member's name for matching.

    Removes every entry of ``substrings_to_remove`` that appears as a separate
    token (case-insensitively), drops all remaining punctuation, collapses
    whitespace and title-cases the result.

    Args:
        raw_name: The name as it appears in the board data.

    Returns:
        The cleaned name, e.g. "Dr. Charles Reed" -> "Charles Reed".

    Raises:
        TypeError: if ``raw_name`` is not a string (for example a missing value).
    """
    without_titles = _TITLE_PATTERN.sub('', raw_name)
    without_punctuation = re.sub(r'[^\w\s]', '', without_titles)
    return " ".join(without_punctuation.split()).title()

# Turn the member-id lists of system_id_map into a list of distinct, non-empty sets.
# Every id is cast to int first and then to str; missing ids are ignored.
system_lists = []
for member_ids in system_id_map.values():
    id_strings = {str(int(member)).strip() for member in member_ids if pd.notna(member)}
    already_listed = any(id_strings == known for known in system_lists)
    if id_strings and not already_listed:
        system_lists.append(id_strings)

# For every year: load the normalized board files, find people who appear on the
# boards of more than one institution (interlocks) and build per-year node and
# edge tables.  Pairs of institutions whose ids both belong to one entry of
# system_lists are not counted.
for year in years:
    board_path = os.path.join(absolute_path, f"{final_scripts}{regression}{normalized_regression_boards}{year}_boards_normalized_regression.csv")
    double_board_path = os.path.join(absolute_path, f"{final_scripts}{regression}{normalized_regression_boards}{year}_double_boards_normalized_regression.csv")
    board_df = pd.read_csv(board_path)
    double_board_df = pd.read_csv(double_board_path)
    
    board_df = remove_non_samples(board_df)
    double_board_df = remove_non_samples(double_board_df)
    
    # Board size = number of rows per Institution in the combined frame
    # (counted before "vacant" rows are skipped below).
    combined_df = pd.concat([board_df, double_board_df], ignore_index=True)
    board_sizes = combined_df.groupby("Institution").size().to_dict()
    
    # cleaned member name -> institutions that name has been seen at so far
    board_member_dict = defaultdict(set)
    # sorted (institution, institution) pair -> edge record
    edge_accum = {}
    # Each institution's affiliation id will be stored as a string of the int value.
    year_nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'AffiliationId': None})
    
    for _, row in combined_df.iterrows():
        name_raw = row["Name"]
        # Rows whose name contains "vacant" (any case) are ignored.
        if "vacant" in str(name_raw).lower():
            continue
        # NOTE(review): name_raw is passed on unconverted; a non-string value
        # (e.g. a missing name) would raise inside clean_name's re.sub - confirm
        # that the Name column never contains missing values here.
        name = clean_name(name_raw)
        institution = row["Institution"]
        # Normalize the id to the text of its integer value; fall back to the
        # plain string form when it cannot be converted to a number.
        if pd.notnull(row["AffiliationId"]):
            try:
                affiliation_id = str(int(float(row["AffiliationId"]))).strip()
            except Exception:
                affiliation_id = str(row["AffiliationId"]).strip()
        else:
            affiliation_id = None

        # Compare the current institution with every institution this person
        # was already seen at earlier in the same year.
        for prev_institution in board_member_dict[name]:
            if prev_institution == institution:
                continue
            # Get affiliation IDs for both institutions, casting to int then to str.
            # .get() is used so that the lookup itself does not create a node entry.
            id1 = None
            if year_nodes_dict.get(prev_institution, {}).get('AffiliationId') is not None:
                try:
                    id1 = str(int(float(year_nodes_dict.get(prev_institution)['AffiliationId']))).strip()
                except Exception:
                    id1 = str(year_nodes_dict.get(prev_institution)['AffiliationId']).strip()
            id2 = affiliation_id
            # Two institutions are in the same system when both ids are known
            # and both are members of one of the sets in system_lists.
            same_system = False
            if id1 and id2:
                for sys_set in system_lists:
                    if id1 in sys_set and id2 in sys_set:
                        same_system = True
                        break
            if same_system:
                continue
            # Edges are undirected: the pair is stored in sorted order.
            pair = tuple(sorted([prev_institution, institution]))
            if pair not in edge_accum:
                edge_id = f"e_{year}_{len(edge_accum)+1}"
                edge_accum[pair] = {
                    'Id': edge_id,
                    'Source': pair[0],
                    'Target': pair[1],
                    'Type': 'Undirected',
                    'Shared': 0,
                    'Year': year
                }
            # One more shared member for this pair; both endpoints' counters go up.
            edge_accum[pair]['Shared'] += 1
            year_nodes_dict[prev_institution]['Interlock_Count'] += 1
            year_nodes_dict[institution]['Interlock_Count'] += 1
        board_member_dict[name].add(institution)
        # Indexing the defaultdict here also creates the node entry for this
        # institution; the first non-empty affiliation id seen is kept.
        if year_nodes_dict[institution]['AffiliationId'] is None and affiliation_id:
            year_nodes_dict[institution]['AffiliationId'] = affiliation_id

    # Edge weight = shared members / (sum of the two board sizes); the
    # denominator falls back to 1 when both sizes are 0.
    for pair, data in edge_accum.items():
        inst1, inst2 = pair
        size1 = board_sizes.get(inst1, 0)
        size2 = board_sizes.get(inst2, 0)
        denom = size1 + size2 if (size1 + size2) > 0 else 1
        data['Weight'] = data['Shared'] / denom

    # One node record per institution present in year_nodes_dict.
    nodes_data = []
    for inst, data in year_nodes_dict.items():
        nodes_data.append({
            'Id': inst,
            'Label': inst,
            'Interlock_Count': data['Interlock_Count'],
            'AffiliationId': data['AffiliationId'],
            'Board_Size': board_sizes.get(inst, 0)
        })
    nodes_df = pd.DataFrame(nodes_data)
    
    # The edge table keeps 'Weight' but not the raw 'Shared' count.
    if edge_accum:
        edges_df = pd.DataFrame(list(edge_accum.values()))
        edges_df = edges_df[['Id', 'Source', 'Target', 'Type', 'Weight', 'Year']]
    else:
        edges_df = pd.DataFrame(columns=['Id', 'Source', 'Target', 'Type', 'Weight', 'Year'])
    
    # Saving is currently disabled, so nodes_df / edges_df are rebuilt for each
    # year and only the last year's tables remain after the loop.
    # out_nodes_path = os.path.join(absolute_path, f"{final_scripts}regression\\interlocks\\{year}_interlocks_nodes.csv")
    # out_edges_path = os.path.join(absolute_path, f"{final_scripts}regression\\interlocks\\{year}_interlocks_edges.csv")
    # nodes_df.to_csv(out_nodes_path, index=False)
    # edges_df.to_csv(out_edges_path, index=False)
    
    # print(f"Year {year} interlock nodes saved to: {out_nodes_path}")
    # print(f"Year {year} interlock edges saved to: {out_edges_path}")


In [15]:
# Count how often each (stripped) member name occurs across all years and both file kinds.
name_counts = defaultdict(int)

for year in years:
    board_path = os.path.join(absolute_path, f"{final_scripts}regression\\normalized_regression_boards\\{year}_boards_normalized_regression.csv")
    double_board_path = os.path.join(absolute_path, f"{final_scripts}regression\\normalized_regression_boards\\{year}_double_boards_normalized_regression.csv")
    for csv_path in (board_path, double_board_path):
        frame = remove_non_samples(pd.read_csv(csv_path))
        for name in frame["Name"].dropna():
            name_counts[str(name).strip()] += 1

all_names = list(name_counts.keys())
# Most frequent spellings are processed first so they become the group seeds.
sorted_names = sorted(all_names, key=lambda n: name_counts[n], reverse=True)

name_mapping = {}
used = set()
# Minimum token_set_ratio score (0-100) for two spellings to be merged.
fuzzy_threshold = 90

# Greedy grouping: every not-yet-assigned name collects all unassigned names
# that score at or above the threshold against it; the whole group is mapped
# to its most frequent spelling.
# NOTE(review): matching is purely textual, so different people with
# near-identical names end up merged - spot-check the printed mapping.
for name in sorted_names:
    if name in used:
        continue
    similar_names = [
        candidate
        for candidate in all_names
        if candidate not in used and fuzz.token_set_ratio(name, candidate) >= fuzzy_threshold
    ]
    # A name can score below the threshold even against itself (e.g. an empty
    # string); keep it as its own group so max() never sees an empty list.
    if name not in similar_names:
        similar_names.append(name)
    canonical = max(similar_names, key=lambda n: name_counts[n])
    for candidate in similar_names:
        name_mapping[candidate] = canonical
        used.add(candidate)

print("Name mapping (only changes):")
for orig, canon in name_mapping.items():
    if orig != canon:
        print(f"{orig} -> {canon}")

# Rewrite every year's normalized files with the canonical member names.
for year in years:
    board_path = os.path.join(absolute_path, f"{final_scripts}regression\\normalized_regression_boards\\{year}_boards_normalized_regression.csv")
    double_board_path = os.path.join(absolute_path, f"{final_scripts}regression\\normalized_regression_boards\\{year}_double_boards_normalized_regression.csv")
    board_df = remove_non_samples(pd.read_csv(board_path))
    double_board_df = remove_non_samples(pd.read_csv(double_board_path))

    for frame, csv_path in ((board_df, board_path), (double_board_df, double_board_path)):
        # Missing names stay untouched; names without a mapping keep their original value.
        frame["Name"] = frame["Name"].apply(
            lambda x: name_mapping.get(str(x).strip(), x) if pd.notnull(x) else x
        )

    # Both files are overwritten in place once both frames have been updated.
    board_df.to_csv(board_path, index=False)
    double_board_df.to_csv(double_board_path, index=False)
    print(f"Year {year} updated with standardized names.")


Name mapping (only changes):
Dr. Charles Reed -> Charles B. Reed
Francie Frederick -> Francie A. Frederick
Francie A. Frederick* -> Francie A. Frederick
Monica Lozano -> Monica C. Lozano
Frederick R. "Fred" Ruiz -> Frederick R. Ruiz
Paul Wachter -> Paul D. Wachter
Gerald L. Parks -> Gerald L. Parsky
Linda Sanford -> Linda S. Sanford
Linda S. Sanford.* -> Linda S. Sanford
John Perez -> John A. Perez
Edmund Gerald Brown, Jr. -> Edmund Gerald "Jerry" Brown, Jr
Erle A. Nye -> Erle Nye
Debra S. rarar -> Debra S. Farar
Dr. Debra S. Farar -> Debra S. Farar
George Gowgani -> George G. Gowgani
Paul L. Foster. -> Paul L. Foster
Cruz Bustamante -> Cruz M. Bustamante
James Richard Huffines -> James R. Huffines
Harvey Wachsman -> Harvey F. Wachsman
Kvriakos Tsakopoulos -> Kyriakos Tsakopoulos
Carol R. Chandler -> Carol Chandler
Gene Stallings -> Gene C. Stallings
Lube Fraga -> Lupe Fraga
Robert E. Stillwell -> Robert L. Stillwell
Colleen McHugh -> M. Colleen McHugh
I Stanley Rogers -> J. Stanley Ro