In [6]:
import pandas as pd
from collections import defaultdict, Counter
import networkx as nx
import os
import re
import matplotlib.pyplot as plt
from rapidfuzz import fuzz
from itertools import combinations

In [7]:
# --- Project configuration -------------------------------------------------
# Root folder of the board_analysis project. The default is the original
# author's local Windows path; set the BOARD_ANALYSIS_DIR environment variable
# to run the notebook from another machine or checkout without editing code.
absolute_path = os.environ.get("BOARD_ANALYSIS_DIR") or (
    "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
)
# Every path fragment in this cell ends with a separator, so keep that
# invariant for an overridden root as well (os.path.join(p, "") appends the
# platform separator). The default already ends with "\\" and is left as-is.
if not absolute_path.endswith(("\\", "/")):
    absolute_path = os.path.join(absolute_path, "")

# Sub-folder names, each with a trailing backslash.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
yearly_interlocks = "yearly_interlocks\\"
final_scripts = "final_scripts\\"
normalized_dataframes = "normalized_dataframes\\"


# Years to analyse, kept as strings because they are interpolated into the
# "<year>_boards_normalized.csv" file names.
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [8]:

def board_turnover(current_members, previous_members):
    """Return the turnover between two board rosters (sets of member names).

    Turnover is the Jaccard distance between the rosters:

        |current Δ previous| / |current ∪ previous|

    i.e. the symmetric difference divided by the union. Returns None when
    both rosters are empty, since the ratio is undefined in that case.
    """
    union_size = len(current_members | previous_members)
    if union_size == 0:
        return None
    return len(current_members ^ previous_members) / union_size


def average_annual_turnover(memberships_current, memberships_prev, year_gap):
    """Average per-institution turnover between two snapshots, per year.

    Parameters
    ----------
    memberships_current, memberships_prev : dict
        Mapping of institution -> set of board member names for the later
        and the earlier snapshot respectively.
    year_gap : int
        Number of years between the two snapshots; must be positive.

    Returns
    -------
    float or None
        Mean turnover over the institutions present in BOTH snapshots,
        divided by ``year_gap`` (a simple linear annualization, so snapshots
        that are several years apart are comparable to consecutive ones).
        None when no institution is comparable.

    Raises
    ------
    ValueError
        If ``year_gap`` is not positive.
    """
    if year_gap <= 0:
        raise ValueError(f"year_gap must be positive, got {year_gap}")

    turnovers = []
    # Only institutions that appear in both snapshots can be compared.
    for inst, current_set in memberships_current.items():
        if inst not in memberships_prev:
            continue
        turnover = board_turnover(current_set, memberships_prev[inst])
        if turnover is not None:
            turnovers.append(turnover)

    if not turnovers:
        return None
    return sum(turnovers) / len(turnovers) / year_gap


# Sort the years chronologically (each entry must be convertible to int).
# Duplicates are dropped so that two consecutive entries can never describe
# the same year, which would give a zero-year gap below.
sorted_years = sorted(dict.fromkeys(years), key=int)

# year -> {institution -> set of unique board member names}
board_memberships = {}

for year in sorted_years:
    file_path = os.path.join(absolute_path, final_scripts, normalized_dataframes, f"{year}_boards_normalized.csv")
    board_df = pd.read_csv(file_path)

    # dropna() keeps missing names out of the rosters.
    membership = {
        inst: set(group["Name"].dropna().unique())
        for inst, group in board_df.groupby("Institution")
    }
    board_memberships[year] = membership

# For every year except the first, compare its boards with the previous
# available year. The stored value is the ANNUALIZED average turnover: the
# mean Jaccard distance divided by the number of years between the snapshots.
turnover_by_year = {}

for prev_year, year in zip(sorted_years, sorted_years[1:]):
    turnover_by_year[year] = average_annual_turnover(
        board_memberships[year],
        board_memberships[prev_year],
        int(year) - int(prev_year),
    )

# Report the result for each year (starting with the second year).
for prev_year, year in zip(sorted_years, sorted_years[1:]):
    turnover = turnover_by_year.get(year)
    if turnover is not None:
        print(f"Average board turnover for year {year} (compared to {prev_year}): {turnover:.2f}")
    else:
        print(f"Average board turnover for year {year} could not be computed (no common institutions).")


Average board turnover for year 2000 (compared to 1999): 0.35
Average board turnover for year 2005 (compared to 2000): 0.15
Average board turnover for year 2007 (compared to 2005): 0.23
Average board turnover for year 2008 (compared to 2007): 0.32
Average board turnover for year 2009 (compared to 2008): 0.26
Average board turnover for year 2011 (compared to 2009): 0.24
Average board turnover for year 2013 (compared to 2011): 0.24
Average board turnover for year 2018 (compared to 2013): 0.15
