In [63]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from ethnicolr import pred_census_ln
from collections import defaultdict
from rapidfuzz import process, fuzz
import networkx as nx
import re

In [64]:
# File Paths
absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
final_scripts = "final_scripts\\"
normalized_dataframes = "normalized_dataframes\\"

college_matching = "college_matching\\"

altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{final_scripts}{normalized_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"
carnegie_map = pd.read_csv(f"{college_matching_path}carnegie_map_openalex.csv")
classification_map = pd.read_csv(f"{college_matching_path}cc_download.csv")
state_path = f"{temporary_data_path}state_systems_validated.csv"   
billionaires_path = f"{temporary_data_path}billionaires_1997_2015.csv"

In [65]:
#helpers

def clean_name(full_name):
    """
    Removes specified substrings and patterns from a full name.
    """
    if not isinstance(full_name, str):
        return "Unknown"

    substrings_to_remove = [
        "Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very",
        "Sr.", "O.P.","Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev",
        "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III"
    ]
    for substring in substrings_to_remove:
        full_name = full_name.replace(substring, "")

    # Remove any capital letter followed by a period and a space (e.g., "J. ")
    full_name = re.sub(r'\b[A-Z]\. ', '', full_name)
    return full_name.strip()

def get_last_name(full_name):
    """
    Extracts the last name from a full name or returns the full name if extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():
        return str(full_name)
    parsed = HumanName(full_name)
    return str(parsed.last) if parsed.last else str(full_name)

def get_first_name(full_name):
    """
    Extracts the first name from a full name or returns the full name if extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():
        return str(full_name)
    parsed = HumanName(full_name)
    return str(parsed.first) if parsed.first else str(full_name)


def classify_ethnicity(raw_eth):
    """
    Classifies ethnicity into 'white', 'poc', or 'unknown'.
    """
    if raw_eth == 'white':
        return 'white'
    elif raw_eth in ['black', 'hispanic', 'asian', 'aian', 'api']:
        return 'poc'
    else:
        return 'unknown'

def classify_gender(gender_str):
    """
    Classifies gender into 'male', 'female', or 'unknown'.
    """
    if gender_str in ['male', 'mostly_male']:
        return 'male'
    elif gender_str in ['female', 'mostly_female']:
        return 'female'
    else:
        return 'unknown'



def pred_gender_fn(df, first_name_col):
    """
    Predicts gender based on first names and classifies it.
    """
    d = gender.Detector()
    df['gender'] = df[first_name_col].apply(lambda x: d.get_gender(x))
    df['gender'] = df['gender'].apply(classify_gender)
    return df

def pred_ethnicity_fn(df, last_name_col):
    """
    Predicts ethnicity based on last names using Ethnicolr's pred_census_ln.
    Returns a DataFrame with a new 'ethnicity' column in {'white', 'poc', 'unknown'}.
    """
    # Ensure valid strings in last_name_col
    df[last_name_col] = df[last_name_col].fillna("Unknown").astype(str)

    # Remove non-alphanumeric chars from each last name
    df[last_name_col] = df[last_name_col].apply(
        lambda x: ''.join(filter(str.isalnum, x)) if isinstance(x, str) else "Unknown"
    )
    # Map year to Ethnicolr-supported census year
    census_year = 2010

    # Use Ethnicolr to predict ethnicity
    df = pred_census_ln(df, last_name_col, year=census_year)
    df['race'] = df['race'].fillna('unknown')
    df['ethnicity'] = df['race'].apply(classify_ethnicity)

    # Optionally drop the raw prediction column if not needed
    df.drop(columns=['race'], inplace=True, errors='ignore')

    return df


def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Classifies gender for each president in the specified years and adds 'female_president'.
    Returns a dict: Institution -> female_president (bool).
    """
    president_data = {}

    for year in valid_years:
        president_df_path = f"{altered_dataframe_path}{year}_presidents.csv"
        president_df = pd.read_csv(president_df_path)

        # Extract first names and predict gender
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['gender'] = president_df['first_name'].apply(
            lambda x: gender.Detector().get_gender(x)
        )
        president_df['gender'] = president_df['gender'].apply(classify_gender)

        # Mark female_president
        president_df['female_president'] = president_df['gender'].eq('female')

        # Store results in a dictionary by Institution
        for _, row in president_df.iterrows():
            # Overwrites if multiple rows for the same Institution
            president_data[row['Institution']] = row['female_president']

    return president_data


In [66]:
def compute_board_turnover(current_board_df, previous_board_df, years_diff):
    """
    Computes the normalized board turnover for each institution.

    Args:
        current_board_df (pd.DataFrame): Current year board data with ['Institution', 'Name'].
        previous_board_df (pd.DataFrame or None): Previous year board data with ['Institution', 'Name'].
        years_diff (int): Number of years between current and previous data points.

    Returns:
        pd.DataFrame: DataFrame with ['Institution', 'board_turnover'].
    """
    if previous_board_df is None or previous_board_df.empty:
        turnover_df = current_board_df[['Institution']].drop_duplicates().copy()
        turnover_df['board_turnover'] = 0.0
        return turnover_df

    # Get unique institutions
    institutions = pd.concat([current_board_df['Institution'], previous_board_df['Institution']]).unique()
    turnover_rows = []

    for inst in institutions:
        curr_members = set(current_board_df.loc[current_board_df['Institution'] == inst, 'Name'])
        prev_members = set(previous_board_df.loc[previous_board_df['Institution'] == inst, 'Name'])
        swapped_members = curr_members.symmetric_difference(prev_members)
        num_swapped = len(swapped_members)

        curr_size = len(curr_members)
        prev_size = len(prev_members)
        avg_size = (curr_size + prev_size) / 2 if (curr_size + prev_size) > 0 else 1

        normalized_turnover = num_swapped / (years_diff * avg_size)
        turnover_rows.append({'Institution': inst, 'board_turnover': normalized_turnover})

    return pd.DataFrame(turnover_rows)

In [67]:
def preprocess_and_count(df, category, categories, predict_fn, name_extractor):
    """
    Generalized preprocessing and counting function for gender and ethnicity.

    Args:
        df (pd.DataFrame): Board data.
        category (str): The category to predict ('gender' or 'ethnicity').
        categories (list): List of expected categories (e.g., ['male', 'female', 'unknown']).
        predict_fn (function): Function to predict the category based on name.
        name_extractor (str): The name part to extract ('first_name' or 'last_name').

    Returns:
        pd.DataFrame: Aggregated counts with renamed columns.
    """
    # Extract names and predict category
    if name_extractor == 'first_name':
        df['first_name'] = df['Name'].apply(get_first_name)
        df = predict_fn(df, 'first_name')
    elif name_extractor == 'last_name':
        df['last_name'] = df['Name'].apply(get_last_name)
        df = predict_fn(df, 'last_name')

    # Handle 'PrimarySample' column
    primary_sample_map = df.drop_duplicates(subset='Institution').set_index('Institution').get('PrimarySample', pd.Series(False, index=df['Institution'].unique()))

    # One-hot encode the category
    category_dummies = pd.get_dummies(df[category], prefix=category)
    category_dummies = category_dummies.reindex(columns=[f"{category}_{c}" for c in categories], fill_value=0)

    # Group by Institution and AffiliationId
    grouped = pd.concat([df[['Institution', 'AffiliationId']], category_dummies], axis=1)
    counts = grouped.groupby(['Institution', 'AffiliationId']).sum().reset_index()

    # Calculate total counts
    count_cols = [f"{category}_{c}" for c in categories]
    total_col = f"total_{category}_board"
    counts[total_col] = counts[count_cols].sum(axis=1)

    # Map PrimarySample
    counts['PrimarySample'] = counts['Institution'].map(primary_sample_map).fillna(False)

    return counts

def process_board_data():
    """
    Processes board data across multiple years to compute gender and ethnicity counts,
    predicts president gender, calculates normalized board turnover, and compiles
    all information into a single DataFrame.

    Returns:
        pd.DataFrame: Compiled DataFrame with all computed metrics.
    """
    numbers_list = []
    prev_board_df = None
    prev_year = None
    sorted_years = sorted(years)

    for year in sorted_years:
        print(f"Processing year: {year}")

        # Load and process president data
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")
        president_df[['first_name', 'last_name']] = president_df['Name'].apply(lambda x: pd.Series({
            'first_name': get_first_name(x),
            'last_name': get_last_name(x)
        }))
        president_df = pred_gender_fn(president_df, 'first_name')
        president_df['female_president'] = president_df['gender'] == 'female'
        president_df = president_df.drop_duplicates(subset='Institution').reset_index(drop=True)
        president_data = president_df.set_index('Institution')['female_president'].to_dict()

        # Load board data
        board_df = pd.read_csv(f"{boards_path}{year}_boards_normalized.csv")
        double_board_df = pd.read_csv(f"{boards_path}{year}_double_boards_normalized.csv")

        # Preprocess and count gender and ethnicity for both board_df and double_board_df
        gender_categories = ['male', 'female', 'unknown']
        ethnicity_categories = ['white', 'poc', 'unknown']

        board_gender_counts = preprocess_and_count(board_df, 'gender', gender_categories, pred_gender_fn, 'first_name')
        board_ethnicity_counts = preprocess_and_count(board_df, 'ethnicity', ethnicity_categories, pred_ethnicity_fn, 'last_name')

        double_gender_counts = preprocess_and_count(double_board_df, 'gender', gender_categories, pred_gender_fn, 'first_name')
        double_ethnicity_counts = preprocess_and_count(double_board_df, 'ethnicity', ethnicity_categories, pred_ethnicity_fn, 'last_name')

        # Rename columns appropriately
        rename_map = {
            'gender_male': 'male_board',
            'gender_female': 'female_board',
            'gender_unknown': 'unknown_board',
            'total_gender_board': 'total_members_board',
            'ethnicity_white': 'white_board',
            'ethnicity_poc': 'poc_board',
            'ethnicity_unknown': 'unknown_eth_board',
            'total_ethnicity_board': 'total_ethnicity_board'
        }
        board_gender_counts.rename(columns=rename_map, inplace=True)
        board_ethnicity_counts.rename(columns={
            'ethnicity_white': 'white_board',
            'ethnicity_poc': 'poc_board',
            'ethnicity_unknown': 'unknown_eth_board',
            'total_ethnicity_board': 'total_ethnicity_board'
        }, inplace=True)

        double_gender_counts.rename(columns={
            'gender_male': 'male_double',
            'gender_female': 'female_double',
            'gender_unknown': 'unknown_double',
            'total_gender_board': 'total_members_double'
        }, inplace=True)
        double_ethnicity_counts.rename(columns={
            'ethnicity_white': 'white_double',
            'ethnicity_poc': 'poc_double',
            'ethnicity_unknown': 'unknown_eth_double',
            'total_ethnicity_board': 'total_ethnicity_double'
        }, inplace=True)

        # Merge gender and ethnicity counts
        merged_gender = board_gender_counts.merge(double_gender_counts, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')
        merged_ethnicity = board_ethnicity_counts.merge(double_ethnicity_counts, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')
        merged_counts = merged_gender.merge(merged_ethnicity, on=['Institution', 'AffiliationId', 'PrimarySample'], how='outer')

        # Fill NaN values with 0 and convert to integers
        count_columns = [
            'male_board', 'female_board', 'unknown_board', 'total_members_board',
            'male_double', 'female_double', 'unknown_double', 'total_members_double',
            'white_board', 'poc_board', 'unknown_eth_board', 'total_ethnicity_board',
            'white_double', 'poc_double', 'unknown_eth_double', 'total_ethnicity_double'
        ]
        merged_counts[count_columns] = merged_counts[count_columns].fillna(0).astype(int)

        # Compute weighted averages for gender and ethnicity
        for category in ['male', 'female', 'unknown', 'white', 'poc', 'unknown_eth']:
            board_col = f"{category}_board" if category not in ['unknown_eth'] else f"{category}_board"
            double_col = f"{category}_double" if category not in ['unknown_eth'] else f"{category}_double"
            total_board = 'total_members_board' if category in ['male', 'female', 'unknown'] else 'total_ethnicity_board'
            total_double = 'total_members_double' if category in ['male', 'female', 'unknown'] else 'total_ethnicity_double'

            merged_counts[category] = (
                (merged_counts[board_col] * merged_counts[total_board] +
                 merged_counts[double_col] * merged_counts[total_double]) /
                (merged_counts[total_board] + merged_counts[total_double]).replace(0, 1)
            ).round().astype(int)

        # Calculate total_members and total_ethnicity
        merged_counts['total_members'] = merged_counts['total_members_board'] + merged_counts['total_members_double']
        merged_counts['total_ethnicity'] = merged_counts['total_ethnicity_board'] + merged_counts['total_ethnicity_double']

        # Add 'Year' and 'female_president' columns
        merged_counts['Year'] = year
        merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)

        # Select relevant columns
        main_cols = [
            'Year', 'Institution', 'AffiliationId', 'female_president', 'PrimarySample',
            'total_members', 'male', 'female', 'unknown',
            'total_ethnicity', 'white', 'poc', 'unknown_eth'
        ]
        counts_df = merged_counts[main_cols].copy()

        # Calculate normalized board turnover
        years_diff = int(year) - int(prev_year) if prev_year is not None else 1
        turnover_df = compute_board_turnover(
            current_board_df=board_df[['Institution', 'Name']],
            previous_board_df=prev_board_df[['Institution', 'Name']] if prev_board_df is not None else None,
            years_diff=years_diff
        )

        # Merge turnover into counts_df
        counts_df = counts_df.merge(turnover_df, on='Institution', how='left')
        counts_df['board_turnover'] = counts_df['board_turnover'].fillna(0.0).astype(float)

        # Append to list
        numbers_list.append(counts_df)

        # Update previous board data and year
        prev_board_df = board_df.copy()
        prev_year = year

    # Concatenate all yearly data
    numbers_df = pd.concat(numbers_list, ignore_index=True).fillna(0.0)

    # Sort the DataFrame
    numbers_df.sort_values(by=['Year', 'Institution', 'AffiliationId'], inplace=True)

    # Calculate yearly changes
    change_cols = ['male', 'female', 'unknown', 'white', 'poc', 'unknown_eth']
    for col in change_cols:
        numbers_df[f"{col}_change"] = numbers_df.groupby(['Institution', 'AffiliationId'])[col].diff().fillna(0).astype(int)

    # Final sort
    numbers_df.sort_values(by=['Year', 'Institution', 'AffiliationId'], inplace=True)

    return numbers_df

In [68]:
university_board_statistics_df = process_board_data()
# university_board_statistics_df.to_csv(altered_dataframe_path + "university_board_statistics.csv", index = False)

Processing year: 1999


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2000


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2005


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2007


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2008


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2009


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2011


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2013


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


Processing year: 2018


  merged_counts['female_president'] = merged_counts['Institution'].map(president_data).fillna(False)


In [69]:
def map_affiliation_id(affiliation_df, university_board_statistics_df):
    count_updates = 0  # Counter for the number of updates

    for index, row in affiliation_df.iterrows():
        institution = row['FullName']
        affiliation_id = row['AffiliationId']
        
        # Get the number of matching rows before updating
        matching_rows = university_board_statistics_df[
            (university_board_statistics_df['Institution'] == institution) & 
            (university_board_statistics_df['AffiliationId'].isna())
        ]
        
        # Update AffiliationId if matches are found
        if not matching_rows.empty:
            count_updates += len(matching_rows)
            university_board_statistics_df.loc[
                (university_board_statistics_df['Institution'] == institution) & 
                (university_board_statistics_df['AffiliationId'].isna()), 'AffiliationId'
            ] = affiliation_id

    # Print the total number of updates made
    print(f"Total AffiliationId entries updated: {count_updates}")

    return university_board_statistics_df

In [70]:
#combined a couple different scripts here so the variable names are slightly changed and makes code a bit messy 

university_board_statistics = university_board_statistics_df
state_system_df = pd.read_csv(state_path)

# Set default value of StateSystem to False
university_board_statistics['StateSystem'] = False

for index, row in state_system_df.iterrows():
    if pd.notna(row['StateSystem']):
        institution = row['Institution']
        affiliation_id = row['AffiliationId']
        state_system_value = row['StateSystem']
        
        university_board_statistics.loc[
            (university_board_statistics['Institution'] == institution) | 
            (university_board_statistics['AffiliationId'] == affiliation_id), 
            'StateSystem'
        ] = state_system_value

# university_board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)

  university_board_statistics.loc[


In [71]:
for year in years:
    # Load the data for the current year
    boards_df = pd.read_csv(f"{boards_path}{year}_boards_normalized.csv")
    
    # Group by Institution and aggregate unique Name values into sets
    grouped = boards_df.groupby('Institution')['Name'].apply(set).reset_index()

    # Track processed institutions and new state system rows
    processed_institutions = set()
    new_state_system_rows = []

    # Create a mapping of name sets (as tuples) to lists of institutions that share them
    name_to_institutions = {}

    for i, group in grouped.iterrows():
        institution = group['Institution']
        names_set = tuple(group['Name'])  # Convert set to a tuple to make it hashable
        
        # Group institutions by their name sets
        if names_set not in name_to_institutions:
            name_to_institutions[names_set] = []
        name_to_institutions[names_set].append(institution)

    # Process each group of institutions that share the same name set
    for names_set, institutions in name_to_institutions.items():
        if len(institutions) > 1:  # Only consider groups with duplicates
            processed_institutions.update(institutions)
            
            # Check if any of the institutions have a state system value
            is_state_system = university_board_statistics.loc[
                (university_board_statistics['Institution'].isin(institutions)) & 
                (university_board_statistics['StateSystem'].notna()) &
                (university_board_statistics['StateSystem'] != False)
            ].any().any()

            if is_state_system:
                # Extract the StateSystem value
                state_system_name = university_board_statistics.loc[
                    (university_board_statistics['Institution'].isin(institutions)) & 
                    (university_board_statistics['StateSystem'].notna()) &
                    (university_board_statistics['StateSystem'] != False), 'StateSystem'
                ].values[0]

                # Use one of the institutions' diversity statistics
                source_stats = university_board_statistics.loc[
                    (university_board_statistics['Institution'].isin(institutions)) & 
                    (university_board_statistics['Year'] == int(year))
                ]

                if not source_stats.empty:
                    source_stats = source_stats.iloc[0]
                    new_row = {
                        'Year': int(year),
                        'Institution': state_system_name,
                        'carnegie_id': np.nan,
                        'AffiliationId': np.nan,
                        'female_president': source_stats['female_president'],
                        'female': source_stats['female'],
                        'male': source_stats['male'],
                        'board_turnover': source_stats['board_turnover'],
                        'unknown': source_stats['unknown'],
                        'unknown_change': source_stats['unknown_change'],
                        'male_change': source_stats['male_change'],
                        'female_change': source_stats['female_change'],
                        'white': source_stats['white'],
                        'poc': source_stats['poc'],
                        'unknown_eth': source_stats['unknown_eth'],
                        'white_change': source_stats['white_change'],
                        'poc_change': source_stats['poc_change'],
                        'unknown_eth_change': source_stats['unknown_eth_change'],
                        'StateSystem': True
                    }
                    new_state_system_rows.append(new_row)

    # Remove duplicate entries from the university statistics
    university_board_statistics = university_board_statistics[
        ~((university_board_statistics['Institution'].isin(processed_institutions)) & 
          (university_board_statistics['Year'] == int(year)))
    ]

    # Append new state system rows, if any
    if new_state_system_rows:
        new_rows_df = pd.DataFrame(new_state_system_rows)
        university_board_statistics = pd.concat([university_board_statistics, new_rows_df], ignore_index=True)
        print(f"Replaced {len(processed_institutions)} duplicate entries with state system rows for year {year}.")

# Save the updated dataframe to CSV
university_board_statistics = university_board_statistics.sort_values(by = ["Year", "Institution"])
# university_board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)
print("Updated university_board_statistics saved.")


Updated university_board_statistics saved.


In [72]:
# -- Refactored Code --
board_statistics = university_board_statistics


# 1. Remove any existing 'carnegie_id', 'state', 'control' columns
board_statistics.drop(columns=['carnegie_id', 'state', 'control'], errors='ignore', inplace=True)

# 2. Merge 'carnegie_id' from carnegie_map based on 'AffiliationId'
board_statistics = board_statistics.merge(
    carnegie_map[['AffiliationId', 'carnegie_id']], 
    on='AffiliationId', 
    how='left'
)
# 3. Merge 'state' and 'control' from classification_map based on 'carnegie_id'
board_statistics = board_statistics.merge(
    classification_map[['unitid', 'state', 'control']], 
    left_on='carnegie_id', 
    right_on='unitid', 
    how='left', 
    suffixes=('', '_classification')
)

# 4. Drop the redundant 'unitid' column
board_statistics.drop(columns='unitid', inplace=True)

# 5. Rename columns if suffixes were added
board_statistics.rename(
    columns={'state_classification': 'state', 'control_classification': 'control'},
    inplace=True
)

# 6. Remove any columns that start with "Unnamed"
board_statistics = board_statistics.loc[:, ~board_statistics.columns.str.contains('^Unnamed')]

# Final DataFrame Verification
print("\nMerged board_statistics DataFrame:")
print(board_statistics)

# Check for (and confirm removal of) unnamed columns
unnamed_columns = [col for col in board_statistics.columns if 'Unnamed' in col]
if unnamed_columns:
    print("\nUnnamed Columns Detected and Removed:")
    print(unnamed_columns)
else:
    print("\nNo Unnamed Columns Detected.")


board_statistics['control'] = board_statistics['control'].apply(
    lambda x: 'Private' if 'Private' in str(x) else 'Public' if 'Public' in str(x) else x
)



Merged board_statistics DataFrame:
      Year                      Institution  AffiliationId  female_president  \
0     1999               Adelphi University     71965598.0             False   
1     1999              American University    181401687.0             False   
2     1999               Andrews University    102298084.0             False   
3     1999         Arizona Board of Regents     55732556.0             False   
4     1999                Auburn University     82497590.0             False   
...    ...                              ...            ...               ...   
1727  2018      Western Michigan University    141649380.0             False   
1728  2018               Widener University    138659443.0              True   
1729  2018  Worcester Polytechnic Institute    107077323.0              True   
1730  2018                  Yale University     32971472.0             False   
1731  2018               Yeshiva University     19772626.0             False   

   

In [73]:
state_to_region = {
    # Northeast
    "CT": "Northeast", "ME": "Northeast", "MA": "Northeast", "NH": "Northeast",
    "RI": "Northeast", "VT": "Northeast", "NJ": "Northeast", "NY": "Northeast", 
    "PA": "Northeast", "DC": "Northeast",
    
    # Midwest
    "IL": "Midwest", "IN": "Midwest", "IA": "Midwest", "KS": "Midwest",
    "MI": "Midwest", "MN": "Midwest", "MO": "Midwest", "NE": "Midwest", 
    "ND": "Midwest", "OH": "Midwest", "SD": "Midwest", "WI": "Midwest",
    
    # South
    "AL": "South", "AR": "South", "DE": "South", "FL": "South",
    "GA": "South", "KY": "South", "LA": "South", "MD": "South",
    "MS": "South", "NC": "South", "OK": "South", "SC": "South",
    "TN": "South", "TX": "South", "VA": "South", "WV": "South",
    
    # West
    "AK": "West", "AZ": "West", "CA": "West", "CO": "West",
    "HI": "West", "ID": "West", "MT": "West", "NV": "West",
    "NM": "West", "OR": "West", "UT": "West", "WA": "West",
    "WY": "West"
}
# board_statistics["regoio"]
board_statistics['region'] = board_statistics['state'].map(state_to_region)
# board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)


In [74]:
# Example dictionary: full state names to abbreviations.
state_to_abbreviation = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY"
}

def update_region_from_institution(row):
    """
    Scans the 'Institution' field for any full state name (case-insensitive).
    If found, uses state_to_abbreviation to get the state abbreviation and then
    state_to_region to get the region. If no match is found, returns the existing
    value of 'region' (or 'Unknown' if not present).
    """
    institution = str(row['Institution'])
    for full_state, abbrev in state_to_abbreviation.items():
        # Check if the full state name appears in the institution name.
        if full_state.lower() in institution.lower():
            # Return the region from the state_to_region mapping.
            return state_to_region.get(abbrev, "Unknown")
    # If no state name is found, return the current region value (or default to "Unknown")
    return row.get('region', "Unknown")

def update_region_from_system(row):
    """
    Scans the 'Institution' field for any full state name (case-insensitive).
    If found, uses state_to_abbreviation to get the state abbreviation and then
    state_to_region to get the region. If no match is found, returns the existing
    value of 'region' (or 'Unknown' if not present).
    """
    institution = str(row['StateSystem'])
    for full_state, abbrev in state_to_abbreviation.items():
        # Check if the full state name appears in the institution name.
        if full_state.lower() in institution.lower():
            # Return the region from the state_to_region mapping.
            return state_to_region.get(abbrev, "Unknown")
    # If no state name is found, return the current region value (or default to "Unknown")
    return row.get('region', "Unknown")


def update_control_from_system(row):
    """
    If the 'control' field is empty (or NaN) and the 'StateSystem' field
    indicates a state system (i.e. it is not empty and not "False"), then 
    update control to "Public". Otherwise, leave the current control value.
    """
    current_control = row.get('control', "")
    # Check if control is missing or empty.
    if pd.isna(current_control) or str(current_control).strip() == "":
        state_sys = str(row.get('StateSystem', "")).strip()
        # If StateSystem is not empty and not "False" (ignoring case), assume it's a state system.
        if state_sys and state_sys.lower() != "false":
            return "Public"
    # Otherwise, return the current control value.
    return current_control

    

# Apply the function row-wise to update the 'region' column.
board_statistics['region'] = board_statistics.apply(update_region_from_institution, axis=1)
board_statistics['region'] = board_statistics.apply(update_region_from_system, axis=1)
board_statistics['control'] = board_statistics.apply(update_control_from_system, axis = 1)

# Optionally, inspect the updated columns.
print(board_statistics[['Institution', 'region']].head())
board_statistics = board_statistics.drop_duplicates()


                Institution     region
0        Adelphi University  Northeast
1       American University  Northeast
2        Andrews University    Midwest
3  Arizona Board of Regents       West
4         Auburn University      South


In [75]:
all_board_members = []
for year in years:
    board_df_path = f"{boards_path}{year}_boards_normalized.csv"
    df = pd.read_csv(board_df_path)
    all_board_members.append(df)

all_board_members_df = pd.concat(all_board_members, ignore_index=True)
all_board_members_df = all_board_members_df.drop_duplicates()

all_board_names = all_board_members_df["Name"].values

In [76]:
billionaire_df = pd.read_csv(billionaires_path)
billionaire_names = billionaire_df["full_name"].values
billionaire_names = set([value.replace("& family", "") for value in billionaire_names])


In [77]:
similarity_threshold = 90

# Preprocess billionaire names
billionaire_names = [name.strip().lower() for name in billionaire_names]

# Load university_board_statistics
university_board_statistics = board_statistics

university_board_statistics['num_billionaires'] = 0


# Create a temporary normalized column for matching
university_board_statistics['Institution_normalized'] = university_board_statistics['Institution'].str.strip().str.lower()
university_board_statistics['Year'] = university_board_statistics['Year'].astype(int)

# Ensure 'num_billionaires' is numeric
university_board_statistics['num_billionaires'] = pd.to_numeric(
    university_board_statistics['num_billionaires'], errors='coerce').fillna(0).astype(int)

billionaire_board_members = []

for year in years:
    print(f"\nProcessing year: {year}")
    board_df_path = f"{boards_path}{year}_boards_normalized.csv"

    # Check if the board file exists
    try:
        board_df = pd.read_csv(board_df_path)
        print(f"Loaded board data for year {year}: {board_df.shape[0]} records.")
    except FileNotFoundError:
        print(f"Error: File {board_df_path} does not exist. Skipping year {year}.")
        continue

    # Normalize 'Name' and 'Institution' in board_df
    board_df['Name'] = board_df['Name'].str.strip().str.lower()
    board_df['Institution_normalized'] = board_df['Institution'].str.strip().str.lower()

    # Create a lookup dictionary for institution by name
    name_to_institution = dict(zip(board_df['Name'], board_df['Institution_normalized']))

    # Get all board member names
    all_board_names = board_df['Name'].tolist()

    counts_per_institution = defaultdict(int)

    # Match each board member to billionaire names
    for member in all_board_names:
        result = process.extractOne(member, billionaire_names, scorer=fuzz.ratio, score_cutoff=similarity_threshold)
        if result is not None:
            match, score, _ = result
            billionaire_board_members.append((member, match, score, year))
            
            institution_name = name_to_institution.get(member)
            if institution_name:
                counts_per_institution[institution_name] += 1

    print(counts_per_institution)

    # Instead of using masks, we do direct lookups.
    for institution, count in counts_per_institution.items():
        # Flag to check if we found a match
        found_match = False

        # Iterate over each row and compare year and institution directly
        for idx in university_board_statistics.index:
            row_year = university_board_statistics.at[idx, 'Year']
            row_institution = university_board_statistics.at[idx, 'Institution_normalized']

            if str(row_year) == str(year) and str(row_institution) == str(institution):
                # Increment num_billionaires for this row
                current_value = university_board_statistics.at[idx, 'num_billionaires']
                university_board_statistics.at[idx, 'num_billionaires'] = current_value + count
                found_match = True

        # if not found_match:
        #     print(f"Warning: No matching rows found for Year: {year}, Institution: {institution}")
        # else:
        #     print(f"Updated rows for Year: {year}, Institution: {institution} with count {count}")

    counts_per_institution.clear()

# Drop the temporary normalized column
university_board_statistics.drop(columns=['Institution_normalized'], inplace=True)

# Final Summary
print(f"\nTotal number of board members who are billionaires (similarity ≥ {similarity_threshold}%): {len(billionaire_board_members)}")
for board_member, matched_billionaire, similarity, year in billionaire_board_members:
    print(f"Year: {year}, Board Member: {board_member}, Matched Billionaire: {matched_billionaire}, Similarity: {similarity}%")




Processing year: 1999
Loaded board data for year 1999: 4941 records.
defaultdict(<class 'int'>, {'andrews university': 1, 'california institute of technology': 2, 'carnegie mellon university': 1, 'chapman university': 1, 'cornell university': 1, 'creighton university': 2, 'emory university': 1, 'george washington university': 1, 'new york university': 4, 'georgetown university': 1, 'johns hopkins university': 2, 'massachusetts institute of technology': 1, 'miami university': 1, 'rochester institute of technology': 1, 'stanford university': 1, 'temple university': 1, 'thomas jefferson university': 1, 'tufts university': 1, 'university of dayton': 1, 'university of miami': 1, 'university of rochester': 1, 'university of southern california': 4, 'university of st thomas minnesota': 2, 'yeshiva university': 2})

Processing year: 2000
Loaded board data for year 2000: 5153 records.
defaultdict(<class 'int'>, {'andrews university': 1, 'baylor university': 1, 'california institute of technolo

In [78]:
# all_nodes_df = pd.DataFrame()  
# all_nodes_list = []        
# all_interlocks_list = []   


# def remove_non_samples(df):
#     return df[df['PrimarySample'] == True]

# board_statistics_df = university_board_statistics


# def group_institutions_by_membership(institution_to_members: dict, threshold: float) -> list:
#     """
#     Given a dictionary {institution -> set_of_members}, return a list of groups (lists)
#     of institutions that have an overlap ratio (intersection divided by the smaller board's size)
#     >= threshold.

#     Used to not inflate the number of interlocks when we observe interlocks between two state schools that 
#     operate under the same board. 
#     Defining the correct threshold is difficult, should probably be lower, as it's not counting boards that shouldnt be
#     """
    
#     institutions = list(institution_to_members.keys())
#     n = len(institutions)
#     adjacency = defaultdict(list)
#     for i in range(n):
#         for j in range(i + 1, n):
#             inst_i, inst_j = institutions[i], institutions[j]
#             members_i = institution_to_members[inst_i]
#             members_j = institution_to_members[inst_j]
#             if not members_i or not members_j:
#                 continue
#             intersection_size = len(members_i.intersection(members_j))
#             smaller_board_size = min(len(members_i), len(members_j))
#             overlap_ratio = intersection_size / smaller_board_size
#             if overlap_ratio >= threshold:
#                 adjacency[inst_i].append(inst_j)
#                 adjacency[inst_j].append(inst_i)
#     visited = set()
#     groups = []
#     for inst in institutions:
#         if inst not in visited:
#             stack = [inst]
#             group = []
#             while stack:
#                 current = stack.pop()
#                 if current not in visited:
#                     visited.add(current)
#                     group.append(current)
#                     for neighbor in adjacency[current]:
#                         if neighbor not in visited:
#                             stack.append(neighbor)
#             groups.append(sorted(group))
#     return [g for g in groups if len(g) > 1]

# def lookup_female_president(row, year):
#     """
#     Look up the 'female_president' boolean from board_statistics_df for a given row and year.
    
#     The function first filters board_statistics_df by the specified year, then attempts to
#     match either on 'AffiliationId' (using row['AffiliationId']) or on 'Institution' (using row['Id']).
#     It returns the boolean value from the 'female_president' column.
    
#     Raises:
#         ValueError: If no matching record is found.
#     """
#     filtered_df = board_statistics_df[board_statistics_df['Year'] == int(year)]
#     if row["Id"] in filtered_df["Institution"].values:
#         matching_row = filtered_df[filtered_df["Institution"] == row["Id"]].iloc[0]
#         if matching_row['female_president'] == True:
#             return True
#         else:
#             return False
#     elif row["AffiliationId"] in filtered_df["AffiliationId"].values:
#         matching_row = filtered_df[filtered_df["AffiliationId"] == row["AffiliationId"]].iloc[0]
#         if matching_row['female_president'] == True:
#             return True
#         else:
#             return False
#     return "Unknown"

# def lookup_column(row, column_name):
#     """
#     look up any column (e.g., 'control', 'region') in board_statistics_df
#     using AffiliationId or Institution name.
#     """
#     matching_rows = board_statistics_df[
#         board_statistics_df['AffiliationId'] == row['AffiliationId']
#     ]
#     if not matching_rows.empty:
#         return matching_rows[column_name].mode()[0]
#     matching_rows = board_statistics_df[
#         board_statistics_df['Institution'] == row['Id']
#     ]
#     if not matching_rows.empty:
#         return matching_rows[column_name].mode()[0]
#     return 'unknown'

# # ------------------------------------------------------------------------------
# # Name Cleaning and Canonicalization (without fuzzy matching)
# # ------------------------------------------------------------------------------
# substrings_to_remove = [
#     "Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very",
#     "Sr.", "O.P.", "Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev",
#     "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III"
# ]

# def clean_name(raw_name: str) -> str:
#     """
#     Clean and canonicalize a board member's name by:
#       - Removing specified title substrings (case-insensitive)
#       - Removing punctuation and extra whitespace
#       - Converting to title case.
#     """
#     for title in substrings_to_remove:
#         title_clean = title.strip()
#         raw_name = re.sub(r'\b' + re.escape(title_clean) + r'\b', '', raw_name, flags=re.IGNORECASE)
#     raw_name = re.sub(r'[^\w\s]', '', raw_name)
#     cleaned_name = " ".join(raw_name.split())
#     return cleaned_name.title()

# # Global edge counter so that edge IDs remain unique across years (when building the combined network)
# edge_id_counter = 1


# '''
# Compute network by year for network and for centrality measures in the regression
# Compute entire network for global network
# '''
# for year in years:
#     print(f"Processing year: {year}")
#     board_member_dict = defaultdict(set)
    

#     boards_path = f"{absolute_path}{board_dataframes}{year}_boards.csv"
#     double_boards_path = f"{absolute_path}{board_dataframes}{year}_double_board.csv"
    
#     boards_df = pd.read_csv(boards_path)
#     double_boards_df = pd.read_csv(double_boards_path)
#     boards_df = remove_non_samples(boards_df)
#     double_boards_df = remove_non_samples(double_boards_df)
    
#     #Map board members to their institution
#     institution_to_members = defaultdict(set)
#     for _, row in boards_df.iterrows():
#         institution_to_members[row['Institution']].add(row['Name'])
#     for _, row in double_boards_df.iterrows():
#         institution_to_members[row['Institution']].add(row['Name'])
    
#     #Board sizes for edge weights
#     board_sizes = {inst: len(members) for inst, members in institution_to_members.items()}
    
#     '''
#     #Identify identical board groups for state systems (not going to right now)
#     threshold = 0.3
#     identical_board_groups = group_institutions_by_membership(institution_to_members, threshold)
#     print(f"  Found {len(identical_board_groups)} identical board group(s) with threshold={threshold}.")
#     for i, g in enumerate(identical_board_groups, start=1):
#         print(f"    Group {i}: {g}")
    
#     # Map each institution to its group index.
#     institution_to_group = {}
#     for idx, group in enumerate(identical_board_groups):
#         for inst in group:
#             institution_to_group[inst] = idx

#     def is_same_group(inst1, inst2):
#         """Return True if both institutions belong to the same identical board group."""
#         return (inst1 in institution_to_group and inst2 in institution_to_group and 
#                 institution_to_group[inst1] == institution_to_group[inst2])
#     '''
    


#     '''
#     Create the network for the year that's being iterated over
#     '''
#     edge_accum = {}  # Key: tuple(sorted([inst1, inst2])); Value: dict with keys: Id, Source, Target, Weight, Year
#     year_nodes_dict = defaultdict(lambda: {'Interlock_Count': 0, 'AffiliationId': None})
    
#     # Context for tracking counts and generating unique edge IDs.
#     context = {
#         'excluded_interlocks_count': 0,
#         'total_interlocks': 0,
#         'edge_id_counter': edge_id_counter
#     }
#     # To avoid duplicate contributions from the same board member within a year.
#     created_interlocks = defaultdict(set)
    
#     def process_board_row(row, ctx):
#         """
#         Process a board row (using raw names) to update interlocks.
#         Skip the row if "vacant" is in the board member's name.
#         For each board member row, for every previous institution that this person has served on,
#         create (or update) an edge whose weight is computed as:
#             w = 1 / min(board_sizes[prev_institution], board_sizes[current_institution])
#         The weight is added to the edge for this year (edges are stored per year).
#         """

#         #some names are listed as vacant, cannot count that as an interlock
#         if "vacant" in row['Name'].lower():
#             return
        
#         name = row['Name'] 
#         name = clean_name(row['Name'])
#         institution = row['Institution']
#         affiliation_id = row['AffiliationId']
        
#         for prev_institution in board_member_dict[name]:
#             if prev_institution == institution:
#                 continue
#             # if is_same_group(prev_institution, institution):
#             #     ctx['excluded_interlocks_count'] += 1
#             #     continue
#             pair = tuple(sorted([prev_institution, institution]))
#             # Avoid duplicate contributions from the same board member.
#             if pair in created_interlocks[name]:
#                 continue
#             created_interlocks[name].add(pair)
            
#             # Calculate weight contribution as 1 divided by the size of the smaller board.
#             w = 1 
            
#             # Since edges are calculated per year, if this pair already exists, update its weight.
#             if pair in edge_accum:
#                 edge_accum[pair]['Weight'] += w
#             else:
#                 edge_id = f"e{ctx['edge_id_counter']}"
#                 ctx['edge_id_counter'] += 1
#                 edge_accum[pair] = {
#                     'Id': edge_id,
#                     'Source': pair[0],
#                     'Target': pair[1],
#                     'Type': 'Undirected',
#                     'Weight': w,
#                     'Year': year
#                 }
#             ctx['total_interlocks'] += 1
#             # Sum the contributions for the nodes.
#             year_nodes_dict[prev_institution]['Interlock_Count'] += w
#             year_nodes_dict[institution]['Interlock_Count'] += w
        
#         board_member_dict[name].add(institution)
#         if year_nodes_dict[institution]['AffiliationId'] is None:
#             year_nodes_dict[institution]['AffiliationId'] = affiliation_id

#     for _, row in boards_df.iterrows():
#         process_board_row(row, context)
#     for _, row in double_boards_df.iterrows():
#         process_board_row(row, context)
    
#     excluded_interlocks_count = context['excluded_interlocks_count']
#     total_interlocks = context['total_interlocks']
#     edge_id_counter = context['edge_id_counter']
    
#     #add weights to the edges
#     for pair, edge in edge_accum.items():
#         shared_members = edge['Weight']
#         total_unique = board_sizes.get(pair[0], 0) + board_sizes.get(pair[1], 0) - shared_members
#         edge['Weight'] = shared_members / total_unique if total_unique > 0 else 0

#     #nodes df for the current year
#     nodes_df = pd.DataFrame(
#         [(inst, data['Interlock_Count'], data['AffiliationId'])
#          for inst, data in year_nodes_dict.items()],
#         columns=['Id', 'Interlock_Count', 'AffiliationId']
#     )
#     nodes_df['Label'] = nodes_df['Id']
    
#     #lookup extra attributes from board_statistics_df.
#     nodes_df['female_president'] = nodes_df.apply(lambda row: lookup_female_president(row, year), axis=1)
#     nodes_df['control'] = nodes_df.apply(lambda row: lookup_column(row, 'control'), axis=1)
#     nodes_df['region'] = nodes_df.apply(lambda row: lookup_column(row, 'region'), axis=1)
    
#     nodes_df = nodes_df[['Id', 'Label', 'Interlock_Count', 'AffiliationId',
#                          'female_president', 'control', 'region']]
    
#     # Convert the accumulated edges dictionary to a DataFrame.
#     edges_df = pd.DataFrame(list(edge_accum.values()))
#     edges_df = edges_df[['Id', 'Source', 'Target', 'Type', 'Weight', 'Year']]

#     print(f"Institutions: {len(year_nodes_dict)} | Total interlocks: {total_interlocks}")
    
#     '''
#     #for the thresholding (not used right now)
#     if total_interlocks + excluded_interlocks_count > 0:
#         proportion_excluded = (excluded_interlocks_count / (total_interlocks + excluded_interlocks_count)) * 100
#     else:
#         proportion_excluded = 0.0
#     print(f"  Institutions: {len(year_nodes_dict)} | Total interlocks: {total_interlocks} | Excluded interlocks: {excluded_interlocks_count} ({proportion_excluded:.2f}%)")
#     '''
    
#     #make network and get centrality measures for the current year
#     G = nx.Graph()
#     for _, node_row in nodes_df.iterrows():
#         G.add_node(
#             node_row["Id"],
#             AffiliationId=node_row["AffiliationId"],
#             Label=node_row["Label"],
#             Interlock_Count=node_row["Interlock_Count"],
#             female_president=node_row["female_president"],
#             control=node_row["control"],
#             region=node_row["region"]
#         )
#     for _, edge_row in edges_df.iterrows():
#         G.add_edge(edge_row["Source"], edge_row["Target"], weight=edge_row.get("Weight", 1))
    
#     betweenness = nx.betweenness_centrality(G, weight="weight")
#     closeness = nx.closeness_centrality(G)
#     eigenvector = nx.eigenvector_centrality(G, max_iter=1000, weight="weight")
#     degree_dict = dict(G.degree())
#     strength_dict = dict(G.degree(weight='weight'))
#     clustering = nx.clustering(G, weight="weight")
    
#     nodes_df["betweenness"] = nodes_df["Id"].map(betweenness)
#     nodes_df["closeness"] = nodes_df["Id"].map(closeness)
#     nodes_df["eigenvector"] = nodes_df["Id"].map(eigenvector)
#     nodes_df["degree"] = nodes_df["Id"].map(degree_dict)
#     nodes_df["clustering"] = nodes_df["Id"].map(clustering)
#     nodes_df["strength"] = nodes_df["Id"].map(strength_dict)
    
#     # Add a 'Year' column for merging later.
#     nodes_df["Year"] = year
#     all_nodes_list.append(nodes_df)
#     all_nodes_df = pd.concat([all_nodes_df, nodes_df], ignore_index=True)
    
#     if not edges_df.empty:
#         all_interlocks_list.append(edges_df)

# #Merge the centrality measures from the yearly networks into the university_board_statistics df
# board_statistics_df['Year'] = board_statistics_df['Year'].astype(str)
# merged_df = board_statistics_df.merge(
#     all_nodes_df[["Year", "AffiliationId", "betweenness", "closeness", "eigenvector", "degree", "strength", "clustering"]],
#     on=["Year", "AffiliationId"],
#     how="left",
#     suffixes=("", "_new")
# )
# for col in ["betweenness", "closeness", "eigenvector", "degree", "strength", "clustering", "Board_Size"]:
#     if f"{col}_new" in merged_df.columns:
#         merged_df[col] = merged_df[f"{col}_new"]
#         merged_df.drop(columns=[f"{col}_new"], inplace=True)

# centrality_columns = ["betweenness", "closeness", "eigenvector", "degree", "strength", "clustering"]

# #write to csv and fill in empty values (when the inst wasn't in the network, the centrality measures are nan)
# merged_df[centrality_columns] = merged_df[centrality_columns].fillna(0)
# university_board_statistics_df = merged_df



In [79]:
college_matching_path = f"{absolute_path}{college_matching}"
university_boards_statistics_path = f"{absolute_path}{final_scripts}{normalized_dataframes}normalized_university_board_statistics.csv"
university_admissions_path = f"{college_matching_path}university_admission_rate.csv"
university_demographics_path = f"{college_matching_path}university_student_demographics.csv"
university_faculty_path = f"{college_matching_path}university_faculty.csv"

university_admissions_df = pd.read_csv(university_admissions_path)
university_demographics_df = pd.read_csv(university_demographics_path)
university_faculty_df = pd.read_csv(university_faculty_path)


In [80]:
# Ensure the Year column is the same type in all DataFrames
university_board_statistics_df = university_board_statistics
university_board_statistics_df['Year'] = university_board_statistics_df['Year'].astype(int)
university_admissions_df['Year'] = university_admissions_df['year'].astype(int)
university_demographics_df['Year'] = university_demographics_df['year'].astype(int)
university_faculty_df['Year'] = university_faculty_df['year'].astype(int)


university_board_statistics_df = university_board_statistics_df.merge(
    university_admissions_df, 
    on=["Year", "AffiliationId"], 
    how="left"
)

# Merge demographics data into board stats
university_board_statistics_df = university_board_statistics_df.merge(
    university_demographics_df, 
    on=["Year", "AffiliationId"], 
    how="left"
)

# Merge demographics data into board stats
# university_faculty_df = university_faculty_df.rename(columns={"year": "Year"})
university_board_statistics_df = university_board_statistics_df.merge(
    university_faculty_df[['Year', 'AffiliationId','student.demographics.faculty.women']], 
    on=["Year", "AffiliationId"], 
    how="left"
)


university_board_statistics_df.to_csv(university_boards_statistics_path, index = False)
print("\nUpdated university_board_statistics saved successfully.")




Updated university_board_statistics saved successfully.
