In [15]:
import pandas as pd
import numpy as np
from collections import defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from ethnicolr import pred_census_ln
from collections import defaultdict
from rapidfuzz import process, fuzz
import networkx as nx
import re

In [16]:
# File Paths
absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
final_scripts = "final_scripts\\"
regression = "regression\\"
normalized_dataframes = "normalized_dataframes\\"
normalized_regression_boards = "normalized_regression_boards\\"


college_matching = "college_matching\\"

altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{final_scripts}{regression}{normalized_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"
carnegie_map = pd.read_csv(f"{college_matching_path}carnegie_map_openalex.csv")
classification_map = pd.read_csv(f"{college_matching_path}cc_download.csv")
state_path = f"{temporary_data_path}state_systems_validated.csv"   
billionaires_path = f"{temporary_data_path}billionaires_1997_2015.csv"

In [17]:
college_matching_path = f"{absolute_path}{college_matching}"
university_boards_statistics_path = f"{absolute_path}{final_scripts}{regression}regression_stats\\regression_university_board_statistics.csv"
university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"
university_faculty_path = f"{college_matching_path}university_faculty.csv"

university_admissions_df = pd.read_csv(university_admissions_path)
university_demographics_df = pd.read_csv(university_demographics_path)
university_faculty_df = pd.read_csv(university_faculty_path)


In [18]:
#helpers

def clean_name(full_name):
    """
    Removes specified substrings and patterns from a full name.
    """
    if not isinstance(full_name, str):
        return "Unknown"

    substrings_to_remove = [
        "Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very",
        "Sr.", "O.P.","Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev",
        "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III"
    ]
    for substring in substrings_to_remove:
        full_name = full_name.replace(substring, "")

    # Remove any capital letter followed by a period and a space (e.g., "J. ")
    full_name = re.sub(r'\b[A-Z]\. ', '', full_name)
    return full_name.strip()

def get_last_name(full_name):
    """
    Extracts the last name from a full name or returns the full name if extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():
        return str(full_name)
    parsed = HumanName(full_name)
    return str(parsed.last) if parsed.last else str(full_name)

def get_first_name(full_name):
    """
    Extracts the first name from a full name or returns the full name if extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():
        return str(full_name)
    parsed = HumanName(full_name)
    return str(parsed.first) if parsed.first else str(full_name)


def classify_ethnicity(raw_eth):
    """
    Classifies ethnicity into 'white', 'poc', or 'unknown'.
    """
    if raw_eth == 'white':
        return 'white'
    elif raw_eth in ['black', 'hispanic', 'asian', 'aian', 'api']:
        return 'poc'
    else:
        return 'unknown'

def classify_gender(gender_str):
    """
    Classifies gender into 'male', 'female', or 'unknown'.
    """
    if gender_str in ['male', 'mostly_male']:
        return 'male'
    elif gender_str in ['female', 'mostly_female']:
        return 'female'
    else:
        return 'unknown'



def pred_gender_fn(df, first_name_col):
    """
    Predicts gender based on first names and classifies it.
    """
    d = gender.Detector()
    df['gender'] = df[first_name_col].apply(lambda x: d.get_gender(x))
    df['gender'] = df['gender'].apply(classify_gender)
    return df

def pred_ethnicity_fn(df, last_name_col):
    """
    Predicts ethnicity based on last names using Ethnicolr's pred_census_ln.
    Returns a DataFrame with a new 'ethnicity' column in {'white', 'poc', 'unknown'}.
    """
    # Ensure valid strings in last_name_col
    df[last_name_col] = df[last_name_col].fillna("Unknown").astype(str)

    # Remove non-alphanumeric chars from each last name
    df[last_name_col] = df[last_name_col].apply(
        lambda x: ''.join(filter(str.isalnum, x)) if isinstance(x, str) else "Unknown"
    )
    # Map year to Ethnicolr-supported census year
    census_year = 2010

    # Use Ethnicolr to predict ethnicity
    df = pred_census_ln(df, last_name_col, year=census_year)
    df['race'] = df['race'].fillna('unknown')
    df['ethnicity'] = df['race'].apply(classify_ethnicity)

    # Optionally drop the raw prediction column if not needed
    df.drop(columns=['race'], inplace=True, errors='ignore')

    return df


def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Classifies gender for each president in the specified years and adds 'female_president'.
    Returns a dict: Institution -> female_president (bool).
    """
    president_data = {}

    for year in valid_years:
        president_df_path = f"{altered_dataframe_path}{year}_presidents.csv"
        president_df = pd.read_csv(president_df_path)

        # Extract first names and predict gender
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['gender'] = president_df['first_name'].apply(
            lambda x: gender.Detector().get_gender(x)
        )
        president_df['gender'] = president_df['gender'].apply(classify_gender)

        # Mark female_president
        president_df['female_president'] = president_df['gender'].eq('female')

        # Store results in a dictionary by Institution
        for _, row in president_df.iterrows():
            # Overwrites if multiple rows for the same Institution
            president_data[row['AffiliationId']] = row['female_president']

    return president_data


In [19]:
def compute_board_turnover(current_board_df, previous_board_df, years_diff):
    """
    Computes the normalized board turnover for each institution.

    Args:
        current_board_df (pd.DataFrame): Current year board data with ['Institution', 'Name'].
        previous_board_df (pd.DataFrame or None): Previous year board data with ['Institution', 'Name'].
        years_diff (int): Number of years between current and previous data points.

    Returns:
        pd.DataFrame: DataFrame with ['AffiliationId', 'board_turnover'].
    """
    if previous_board_df is None or previous_board_df.empty:
        turnover_df = current_board_df[['AffiliationId']].drop_duplicates().copy()
        turnover_df['board_turnover'] = 0.0
        return turnover_df

    # Get unique institutions
    institutions = pd.concat([current_board_df['AffiliationId'], previous_board_df['AffiliationId']]).unique()
    turnover_rows = []

    for inst in institutions:
        curr_members = set(current_board_df.loc[current_board_df['AffiliationId'] == inst, 'Name'])
        prev_members = set(previous_board_df.loc[previous_board_df['AffiliationId'] == inst, 'Name'])
        swapped_members = curr_members.symmetric_difference(prev_members)
        num_swapped = len(swapped_members)

        curr_size = len(curr_members)
        prev_size = len(prev_members)
        avg_size = (curr_size + prev_size) / 2 if (curr_size + prev_size) > 0 else 1

        normalized_turnover = num_swapped / (years_diff * avg_size)
        turnover_rows.append({'AffiliationId': inst, 'board_turnover': normalized_turnover})

    return pd.DataFrame(turnover_rows)

In [20]:
def preprocess_and_count(df, category, categories, predict_fn, name_extractor):
    """
    Generalized preprocessing and counting function for gender and ethnicity.

    Args:
        df (pd.DataFrame): Board data.
        category (str): The category to predict ('gender' or 'ethnicity').
        categories (list): List of expected categories (e.g., ['male', 'female', 'unknown']).
        predict_fn (function): Function to predict the category based on name.
        name_extractor (str): The name part to extract ('first_name' or 'last_name').

    Returns:
        pd.DataFrame: Aggregated counts with renamed columns.
    """
    # Extract names and predict category
    if name_extractor == 'first_name':
        df['first_name'] = df['Name'].apply(get_first_name)
        df = predict_fn(df, 'first_name')
    elif name_extractor == 'last_name':
        df['last_name'] = df['Name'].apply(get_last_name)
        df = predict_fn(df, 'last_name')

    # Handle 'PrimarySample' column
    primary_sample_map = df.drop_duplicates(subset='AffiliationId').set_index('AffiliationId').get('PrimarySample', pd.Series(False, index=df['AffiliationId'].unique()))

    # One-hot encode the category
    category_dummies = pd.get_dummies(df[category], prefix=category)
    category_dummies = category_dummies.reindex(columns=[f"{category}_{c}" for c in categories], fill_value=0)

    # Group by Institution and AffiliationId
    grouped = pd.concat([df[['AffiliationId', 'Institution']], category_dummies], axis=1)
    counts = grouped.groupby('AffiliationId').agg({
        'Institution': 'first', 
        **{col: 'sum' for col in category_dummies.columns}
    }).reset_index()

    # Calculate total counts
    count_cols = [f"{category}_{c}" for c in categories]
    total_col = f"total_{category}_board"
    counts[total_col] = counts[count_cols].sum(axis=1)

    # Map PrimarySample
    counts['PrimarySample'] = counts['AffiliationId'].map(primary_sample_map).fillna(False)

    return counts


In [21]:
def process_board_data():
    """
    Processes board data across multiple years to compute gender and ethnicity counts,
    predicts president gender, calculates normalized board turnover, and compiles
    all information into a single DataFrame.

    Returns:
        pd.DataFrame: Compiled DataFrame with all computed metrics.
    """
    numbers_list = []
    prev_board_df = None
    prev_year = None
    sorted_years = sorted(years)

    for year in sorted_years:
        print(f"Processing year: {year}")

        # Load and process president data
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")
        president_df[['first_name', 'last_name']] = president_df['Name'].apply(lambda x: pd.Series({
            'first_name': get_first_name(x),
            'last_name': get_last_name(x)
        }))
        president_df = pred_gender_fn(president_df, 'first_name')
        president_df['female_president'] = president_df['gender'] == 'female'
        president_df = president_df.drop_duplicates(subset='Institution').reset_index(drop=True)
        president_data = president_df.set_index('Institution')['female_president'].to_dict()

        board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_boards_normalized_regression.csv"
        double_board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_double_boards_normalized_regression.csv"
        board_df = pd.read_csv(board_path)
        double_board_df = pd.read_csv(double_board_path)

        # Preprocess and count gender and ethnicity for both board_df and double_board_df
        gender_categories = ['male', 'female', 'unknown']
        ethnicity_categories = ['white', 'poc', 'unknown']

        board_gender_counts = preprocess_and_count(board_df, 'gender', gender_categories, pred_gender_fn, 'first_name')
        board_ethnicity_counts = preprocess_and_count(board_df, 'ethnicity', ethnicity_categories, pred_ethnicity_fn, 'last_name')

        double_gender_counts = preprocess_and_count(double_board_df, 'gender', gender_categories, pred_gender_fn, 'first_name')
        double_ethnicity_counts = preprocess_and_count(double_board_df, 'ethnicity', ethnicity_categories, pred_ethnicity_fn, 'last_name')

        # Rename columns appropriately
        rename_map = {
            'gender_male': 'male_board',
            'gender_female': 'female_board',
            'gender_unknown': 'unknown_board',
            'total_gender_board': 'total_members_board',
            'ethnicity_white': 'white_board',
            'ethnicity_poc': 'poc_board',
            'ethnicity_unknown': 'unknown_eth_board',
            'total_ethnicity_board': 'total_ethnicity_board'
        }
        board_gender_counts.rename(columns=rename_map, inplace=True)
        board_ethnicity_counts.rename(columns={
            'ethnicity_white': 'white_board',
            'ethnicity_poc': 'poc_board',
            'ethnicity_unknown': 'unknown_eth_board',
            'total_ethnicity_board': 'total_ethnicity_board'
        }, inplace=True)

        double_gender_counts.rename(columns={
            'gender_male': 'male_double',
            'gender_female': 'female_double',
            'gender_unknown': 'unknown_double',
            'total_gender_board': 'total_members_double'
        }, inplace=True)
        double_ethnicity_counts.rename(columns={
            'ethnicity_white': 'white_double',
            'ethnicity_poc': 'poc_double',
            'ethnicity_unknown': 'unknown_eth_double',
            'total_ethnicity_board': 'total_ethnicity_double'
        }, inplace=True)

        # Merge gender and ethnicity counts
        merged_gender = board_gender_counts.merge(double_gender_counts, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')
        merged_ethnicity = board_ethnicity_counts.merge(double_ethnicity_counts, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')
        merged_counts = merged_gender.merge(merged_ethnicity, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')
        # Fill NaN values with 0 and convert to integers
        count_columns = [
            'male_board', 'female_board', 'unknown_board', 'total_members_board',
            'male_double', 'female_double', 'unknown_double', 'total_members_double',
            'white_board', 'poc_board', 'unknown_eth_board', 'total_ethnicity_board',
            'white_double', 'poc_double', 'unknown_eth_double', 'total_ethnicity_double'
        ]
        merged_counts[count_columns] = merged_counts[count_columns].fillna(0).astype(int)

        # Compute weighted averages for gender and ethnicity
        for category in ['male', 'female', 'unknown', 'white', 'poc', 'unknown_eth']:
            board_col = f"{category}_board" if category not in ['unknown_eth'] else f"{category}_board"
            double_col = f"{category}_double" if category not in ['unknown_eth'] else f"{category}_double"
            total_board = 'total_members_board' if category in ['male', 'female', 'unknown'] else 'total_ethnicity_board'
            total_double = 'total_members_double' if category in ['male', 'female', 'unknown'] else 'total_ethnicity_double'

            merged_counts[category] = (
                (merged_counts[board_col] * merged_counts[total_board] +
                 merged_counts[double_col] * merged_counts[total_double]) /
                (merged_counts[total_board] + merged_counts[total_double]).replace(0, 1)
            ).round().astype(int)
        # Calculate total_members and total_ethnicity
        merged_counts['total_members'] = merged_counts['total_members_board'] + merged_counts['total_members_double']
        merged_counts['total_ethnicity'] = merged_counts['total_ethnicity_board'] + merged_counts['total_ethnicity_double']

        # Add 'Year' and 'female_president' columns
        merged_counts['Year'] = year
        merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)

        # Select relevant columns
        main_cols = [
            'Year', 'Institution', 'AffiliationId', 'female_president', 'PrimarySample',
            'total_members', 'male', 'female', 'unknown',
            'total_ethnicity', 'white', 'poc', 'unknown_eth'
        ]
        counts_df = merged_counts[main_cols].copy()

        # Calculate normalized board turnover
        years_diff = int(year) - int(prev_year) if prev_year is not None else 1
        turnover_df = compute_board_turnover(
            current_board_df=board_df[['AffiliationId', 'Name']],
            previous_board_df=prev_board_df[['AffiliationId', 'Name']] if prev_board_df is not None else None,
            years_diff=years_diff
        )

        # Merge turnover into counts_df
        counts_df = counts_df.merge(turnover_df, on='AffiliationId', how='left')
        counts_df['board_turnover'] = counts_df['board_turnover'].fillna(0.0).astype(float)

        # Append to list
        numbers_list.append(counts_df)

        # Update previous board data and year
        prev_board_df = board_df.copy()
        prev_year = year

    # Concatenate all yearly data
    numbers_df = pd.concat(numbers_list, ignore_index=True).fillna(0.0)

    # Sort the DataFrame
    numbers_df.sort_values(by=['Year', 'Institution', 'AffiliationId'], inplace=True)

    # Calculate yearly changes
    change_cols = ['male', 'female', 'unknown', 'white', 'poc', 'unknown_eth']
    for col in change_cols:
        numbers_df[f"{col}_change"] = numbers_df.groupby(['Institution', 'AffiliationId'])[col].diff().fillna(0).astype(int)

    # Final sort
    numbers_df.sort_values(by=['Year', 'Institution', 'AffiliationId'], inplace=True)

    return numbers_df

In [22]:

university_board_statistics_df = process_board_data()

Processing year: 1999


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2000


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2005


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2007


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2008


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2009


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2011


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2013


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2018


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


In [23]:
# 1. Remove any existing 'carnegie_id', 'state', 'control' columns
university_board_statistics_df.drop(columns=['carnegie_id', 'state', 'control'], errors='ignore', inplace=True)

# 2. Merge 'carnegie_id' from carnegie_map based on 'AffiliationId'
university_board_statistics_df= university_board_statistics_df.merge(
    carnegie_map[['AffiliationId', 'carnegie_id']], 
    on='AffiliationId', 
    how='left'
)
# 3. Merge 'state' and 'control' from classification_map based on 'carnegie_id'
university_board_statistics_df= university_board_statistics_df.merge(
    classification_map[['unitid', 'state', 'control']], 
    left_on='carnegie_id', 
    right_on='unitid', 
    how='left', 
    suffixes=('', '_classification')
)
university_board_statistics_df.drop(columns='unitid', inplace=True)
university_board_statistics_df.rename(
    columns={'state_classification': 'state', 'control_classification': 'control'},
    inplace=True
)

# 6. Remove any columns that start with "Unnamed"
university_board_statistics_df= university_board_statistics_df.loc[:, ~university_board_statistics_df.columns.str.contains('^Unnamed')]

university_board_statistics_df['control'] = university_board_statistics_df['control'].apply(
    lambda x: 'Private' if 'Private' in str(x) else 'Public' if 'Public' in str(x) else x
)

In [24]:
state_system_df = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_systems.csv")
university_board_statistics_df['StateSystem'] = False

for index, row in state_system_df.iterrows():
    if pd.notna(row['StateSystem']):
        affiliation_id = row['AffiliationId']
        state_system_value = row['StateSystem']
        
        # Only match on AffiliationId
        university_board_statistics_df.loc[
            university_board_statistics_df['AffiliationId'] == affiliation_id, 
            'StateSystem'
        ] = state_system_value


  university_board_statistics_df.loc[


In [25]:
state_to_region = {
    # Northeast
    "CT": "Northeast", "ME": "Northeast", "MA": "Northeast", "NH": "Northeast",
    "RI": "Northeast", "VT": "Northeast", "NJ": "Northeast", "NY": "Northeast", 
    "PA": "Northeast", "DC": "Northeast",
    
    # Midwest
    "IL": "Midwest", "IN": "Midwest", "IA": "Midwest", "KS": "Midwest",
    "MI": "Midwest", "MN": "Midwest", "MO": "Midwest", "NE": "Midwest", 
    "ND": "Midwest", "OH": "Midwest", "SD": "Midwest", "WI": "Midwest",
    
    # South
    "AL": "South", "AR": "South", "DE": "South", "FL": "South",
    "GA": "South", "KY": "South", "LA": "South", "MD": "South",
    "MS": "South", "NC": "South", "OK": "South", "SC": "South",
    "TN": "South", "TX": "South", "VA": "South", "WV": "South",
    
    # West
    "AK": "West", "AZ": "West", "CA": "West", "CO": "West",
    "HI": "West", "ID": "West", "MT": "West", "NV": "West",
    "NM": "West", "OR": "West", "UT": "West", "WA": "West",
    "WY": "West"
}
university_board_statistics_df['region'] = university_board_statistics_df['state'].map(state_to_region)

In [26]:
state_to_abbreviation = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY"
}

def update_region_from_system(row):
    """
    Scans the 'Institution' field for any full state name (case-insensitive).
    If found, uses state_to_abbreviation to get the state abbreviation and then
    state_to_region to get the region. If no match is found, returns the existing
    value of 'region' (or 'Unknown' if not present).
    """
    print(row)
    institution = str(row['StateSystem'])
    for full_state, abbrev in state_to_abbreviation.items():
        # Check if the full state name appears in the institution name.
        if full_state.lower() in institution.lower():
            # Return the region from the state_to_region mapping.
            return state_to_region.get(abbrev, "Unknown")
    # If no state name is found, return the current region value (or default to "Unknown")
    return row.get('region', "Unknown")


def update_control_from_system(row):
    """
    If the 'control' field is empty (or NaN) and the 'StateSystem' field
    indicates a state system (i.e. it is not empty and not "False"), then 
    update control to "Public". Otherwise, leave the current control value.
    """
    current_control = row.get('control', "")
    # Check if control is missing or empty.
    if pd.isna(current_control) or str(current_control).strip() == "":
        state_sys = str(row.get('StateSystem', "")).strip()
        # If StateSystem is not empty and not "False" (ignoring case), assume it's a state system.
        if state_sys and state_sys.lower() != "false":
            return "Public"
    # Otherwise, return the current control value.
    return current_control

    
university_board_statistics_df['region'] = university_board_statistics_df.apply(update_region_from_system, axis=1)
university_board_statistics_df['control'] = university_board_statistics_df.apply(update_control_from_system, axis = 1)

print(university_board_statistics_df[['Institution', 'region']].head())
university_board_statistics_df = university_board_statistics_df.drop_duplicates()
billionaire_df = pd.read_csv(billionaires_path)
billionaire_names = billionaire_df["full_name"].values
billionaire_names = set([value.replace("& family", "") for value in billionaire_names])


Year                                1999
Institution           Adelphi University
AffiliationId                 71965598.0
female_president                   False
PrimarySample                       True
total_members                         17
male                                  13
female                                 3
unknown                                1
total_ethnicity                       17
white                                 15
poc                                    2
unknown_eth                            0
board_turnover                       0.0
male_change                            0
female_change                          0
unknown_change                         0
white_change                           0
poc_change                             0
unknown_eth_change                     0
carnegie_id                     188429.0
state                                 NY
control                          Private
StateSystem                        False
region          

In [27]:
billionaire_df = pd.read_csv(billionaires_path)
billionaire_names = billionaire_df["full_name"].values
billionaire_names = set([value.replace("& family", "") for value in billionaire_names])


similarity_threshold = 90
billionaire_names = [name.strip().lower() for name in billionaire_names]
university_board_statistics_df['num_billionaires'] = 0


# Create a temporary normalized column for matching
# university_board_statistics_df['Year'] = university_board_statistics_df['Year'].astype(int)

# Ensure 'num_billionaires' is numeric
university_board_statistics_df['num_billionaires'] = pd.to_numeric(university_board_statistics_df['num_billionaires'], errors='coerce').fillna(0).astype(int)

billionaire_board_members = []

for year in years:
    print(f"\nProcessing year: {year}")
    board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_boards_normalized_regression.csv"
    # double_board_path = f"{absolute_path}{final_scripts}{regression}{normalized_regression_boards}{year}_double_boards_normalized_regression.csv"
    board_df = pd.read_csv(board_path)
    # double_board_df = pd.read_csv(double_board_path)

    board_df['Name'] = board_df['Name'].str.strip().str.lower()

    # Create a lookup dictionary for affiliationid by name
    name_to_institution = dict(zip(board_df['Name'], board_df['AffiliationId']))
    all_board_names = board_df['Name'].tolist()
    counts_per_institution = defaultdict(int)

    # Match each board member to billionaire names
    for member in all_board_names:
        result = process.extractOne(member, billionaire_names, scorer=fuzz.ratio, score_cutoff=similarity_threshold)
        if result is not None:
            match, score, _ = result
            billionaire_board_members.append((member, match, score, year))
            
            institution_name = name_to_institution.get(member)
            if institution_name:
                counts_per_institution[institution_name] += 1

    print(counts_per_institution)

    #direct lookup
    for institution, count in counts_per_institution.items():
        found_match = False

        # Iterate over each row and compare year and institution directly
        for idx in university_board_statistics_df.index:
            row_year = university_board_statistics_df.at[idx, 'Year']
            row_institution = university_board_statistics_df.at[idx, 'AffiliationId']

            if str(row_year) == str(year) and str(row_institution) == str(institution):
                # Increment num_billionaires for this row
                current_value = university_board_statistics_df.at[idx, 'num_billionaires']
                university_board_statistics_df.at[idx, 'num_billionaires'] = current_value + count
                found_match = True

    counts_per_institution.clear()

# Final Summary
print(f"\nTotal number of board members who are billionaires (similarity ≥ {similarity_threshold}%): {len(billionaire_board_members)}")
for board_member, matched_billionaire, similarity, year in billionaire_board_members:
    print(f"Year: {year}, Board Member: {board_member}, Matched Billionaire: {matched_billionaire}, Similarity: {similarity}%")




Processing year: 1999
defaultdict(<class 'int'>, {74973139.0: 1, 185071736.0: 1, 205783295.0: 1, 150468666.0: 1, 193531525.0: 1, 57206974.0: 4, 184565670.0: 1, 145311948.0: 2, 26347476.0: 1, 63966007.0: 1, 155173764.0: 1, 97018004.0: 1, 84392919.0: 1, 121934306.0: 1, 40347166.0: 1, 127591826.0: 1, 1174212.0: 5, 161515732.0: 1, 19772626.0: 1})

Processing year: 2000
defaultdict(<class 'int'>, {157394403.0: 1, 74973139.0: 1, 185071736.0: 1, 130785548.0: 1, 205783295.0: 1, 150468666.0: 1, 193531525.0: 1, 184565670.0: 1, 140172145.0: 3, 145311948.0: 2, 26347476.0: 1, 57206974.0: 4, 155173764.0: 1, 97018004.0: 1, 84392919.0: 1, 121934306.0: 1, 40347166.0: 1, 127591826.0: 1, 1174212.0: 3, 161515732.0: 3, 200719446.0: 1, 19772626.0: 1})

Processing year: 2005
defaultdict(<class 'int'>, {157394403.0: 1, 111088046.0: 1, 6902469.0: 2, 27804330.0: 1, 74973139.0: 1, 185071736.0: 1, 205783295.0: 1, 140172145.0: 3, 136199984.0: 1, 145311948.0: 1, 57206974.0: 4, 203088144.0: 1, 155173764.0: 1, 97018

In [28]:
# university_board_statistics_df['Year'] = university_board_statistics_df['Year'].astype(int)
# university_admissions_df['Year'] = university_admissions_df['year'].astype(int)
# university_demographics_df['Year'] = university_demographics_df['year'].astype(int)
# university_faculty_df['Year'] = university_faculty_df['year'].astype(int)


# university_board_statistics_df = university_board_statistics_df.merge(
#     university_admissions_df, 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )

# # Merge demographics data into board stats
# university_board_statistics_df = university_board_statistics_df.merge(
#     university_demographics_df, 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )

# # Merge demographics data into board stats
# # university_faculty_df = university_faculty_df.rename(columns={"year": "Year"})
# university_board_statistics_df = university_board_statistics_df.merge(
#     university_faculty_df[['Year', 'AffiliationId','student.demographics.faculty.women']], 
#     on=["Year", "AffiliationId"], 
#     how="left"
# )


university_board_statistics_df.to_csv(university_boards_statistics_path, index = False)
print("\nUpdated university_board_statistics saved successfully.")




Updated university_board_statistics saved successfully.
