In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import ethnicolr
from ethnicolr import pred_census_ln
import re

In [39]:
# File Paths
absolute_path = "C:\\Users\\tykun\\\OneDrive\\Documents\\SchoolDocs\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"
college_matching = "college_matching\\"

altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"
carnegie_map = pd.read_csv(f"{college_matching_path}carnegie_map_openalex.csv")
board_statistics = pd.read_csv(f"{altered_dataframe_path}sample_board_statistics.csv")
classification_map = pd.read_csv(f"{college_matching_path}cc_download.csv")
state_path = f"{temporary_data_path}state_systems_validated.csv"   

In [40]:
def clean_name(full_name):
    """
    Removes specified substrings and patterns from a full name.

    Args:
        full_name (str): The full name to clean.

    Returns:
        str: The cleaned full name.
    """
    if not isinstance(full_name, str):  # Ensure the input is a string
        return "Unknown"

    # Substrings to remove
    substrings_to_remove = ["Rev.", "SJ", "Sister", "Brother", "Father", "OP", "The Very", "Sr.", "O.P.","Very Rev.", "Br.", "Dr.", "Md.", "S.J.", "Very Rev", "M.D.", "O.P", "S.J", "J.R", "Jr.", "Jr ", "III"]

    # Remove specified substrings
    for substring in substrings_to_remove:
        full_name = full_name.replace(substring, "")

    # Remove any capital letter followed by a period and a space (e.g., "J. ")
    full_name = re.sub(r'\b[A-Z]\. ', '', full_name)

    # Strip extra spaces
    return full_name.strip()

def get_last_name(full_name):
    """
    Extracts the last name from a full name or returns the full name if extraction fails.

    Args:
        full_name (str): The full name of a person.

    Returns:
        str: The last name, or the full name if last name extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():  # Check for None or empty strings
        return str(full_name)
    full_name = HumanName(full_name)
    return str(full_name.last) if full_name.last else str(full_name)

def get_first_name(full_name):
    """
    Extracts the first name from a full name or returns the full name if extraction fails.

    Args:
        full_name (str): The full name of a person.

    Returns:
        str: The first name, or the full name if first name extraction fails.
    """
    full_name = clean_name(full_name)
    if not full_name or not full_name.strip():  # Check for None or empty strings
        return str(full_name)
    full_name = HumanName(full_name)
    return str(full_name.first) if full_name.first else str(full_name)

# Classifies ethnicity into 'white', 'poc', or 'unknown'
def classify_ethnicity(ethnicity):
    """
    Classifies ethnicity into 'white', 'poc', or 'unknown'.

    Args:
        ethnicity (str): The ethnicity label.

    Returns:
        str: The classified ethnicity group.
    """
    if ethnicity == 'white':
        return 'white'
    elif ethnicity in ['api', 'black', 'hispanic', 'aian']:
        return 'poc'
    else:
        return 'unknown'

# Classifies gender into 'male', 'female', or 'unknown'
def classify_gender(gender_str):
    """
    Classifies gender into 'male', 'female', or 'unknown'.

    Args:
        gender_str (str): The raw gender prediction string.

    Returns:
        str: The classified gender group.
    """
    if gender_str in ['male', 'mostly_male']:
        return 'male'
    elif gender_str in ['female', 'mostly_female']:
        return 'female'
    else:
        return 'unknown'

# Predicts gender based on first names and adds a 'gender' column to the dataframe
def pred_gender_fn(df, first_name_col):
    """
    Predicts gender based on first names and classifies it.

    Args:
        df (pd.DataFrame): The dataframe containing the first name column.
        first_name_col (str): The column name containing first names.

    Returns:
        pd.DataFrame: The dataframe with a new 'gender' column.
    """
    d = gender.Detector()
    df['gender'] = df[first_name_col].apply(lambda x: d.get_gender(x))
    df['gender'] = df['gender'].apply(classify_gender)
    return df


def pred_census_ln_fn(df, last_name_col, year=2000):
    """
    Predicts ethnicity based on last names and classifies it.

    Args:
        df (pd.DataFrame): The dataframe containing the last name column.
        last_name_col (str): The column name containing last names.
        year (int): The year of the Census model to use for predictions.

    Returns:
        pd.DataFrame: The dataframe with a new 'ethnicity' column.
    """
    # Ensure the last name column contains valid strings
    df[last_name_col] = df[last_name_col].fillna("Unknown").astype(str)

    # Sanitize the last_name_col: Remove non-alphanumeric characters and trim whitespace
    df[last_name_col] = df[last_name_col].apply(lambda x: ''.join(filter(str.isalnum, x)) if isinstance(x, str) else "Unknown")

    # Use Ethnicolr to predict ethnicity
    try:
        df = pred_census_ln(df, last_name_col, year=year)
    except ValueError as e:
        print(f"Error in Ethnicolr prediction: {e}")
        raise

    # Extract the raw ethnicity prediction column
    ethnicity_col = f"{last_name_col}_pred_census_{year}"
    if ethnicity_col not in df.columns:
        raise ValueError(f"Expected column '{ethnicity_col}' missing after Ethnicolr prediction.")

    # Create the 'ethnicity' column and classify it
    df['ethnicity'] = df[ethnicity_col].apply(classify_ethnicity)

    # Drop the raw prediction column to keep only the classified 'ethnicity'
    df.drop(columns=[ethnicity_col], inplace=True)

    return df


def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Classifies gender for each president in the specified years and adds a 'female_president' column.

    Args:
        valid_years (list): A list of years to process.
        altered_dataframe_path (str): The path to the directory containing president CSV files.

    Returns:
        dict: A dictionary with 'Institution' as keys and 'female_president' as boolean values.
    """
    president_data = {}

    for year in valid_years:
        president_df_path = f"{altered_dataframe_path}{year}_presidents.csv"
        president_df = pd.read_csv(president_df_path)

        # Extract first names and predict gender
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['gender'] = president_df['first_name'].apply(lambda x: gender.Detector().get_gender(x))
        president_df['gender'] = president_df['gender'].apply(classify_gender)

        # Add 'female_president' column
        president_df['female_president'] = president_df['gender'].apply(lambda x: True if x == 'female' else False)

        # Store results in a dictionary by Institution
        for _, row in president_df.iterrows():
            president_data[row['Institution']] = row['female_president']

    return president_data



def process_board_data():
    """
    Processes board data for each year to compute gender counts using both board_df and double_board_df,
    predicts president gender, and adds 'female_president', 'AffiliationId', and 'PrimarySample' columns.
    Retains original columns and adds membership counts, including 'total_members' and yearly changes.

    Returns:
        pd.DataFrame: A DataFrame with original columns, yearly change statistics, and added
                      president/affiliation data (for gender only).
    """
    numbers_list = []

    for year in years:
        # Load the president data for the current year
        president_df_path = f"{altered_dataframe_path}{year}_presidents.csv"
        president_df = pd.read_csv(president_df_path)

        # Extract names & predict gender
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['last_name'] = president_df['Name'].apply(get_last_name)
        president_df = pred_gender_fn(president_df, 'first_name')

        # Mark female_president
        president_df['female_president'] = president_df['gender'].apply(lambda x: True if x == 'female' else False)

        # Remove duplicates so each Institution is unique
        president_df = president_df.drop_duplicates(subset='Institution').reset_index(drop=True)

        # Create dictionary mapping institutions -> {female_president}
        president_data = president_df.set_index('Institution')[['female_president']].to_dict('index')

        # Load the board data for the current year
        board_df_path = f"{boards_path}{year}_boards.csv"
        board_df = pd.read_csv(board_df_path)

        # Load the double board data for the current year
        double_board_df_path = f"{boards_path}{year}_double_board.csv"
        double_board_df = pd.read_csv(double_board_df_path)

        # Function to preprocess and compute gender counts
        def preprocess_and_count(df):
            # Extract names & predict gender
            df['first_name'] = df['Name'].apply(get_first_name)
            df['last_name'] = df['Name'].apply(get_last_name)
            df = pred_gender_fn(df, 'first_name')

            # Check for 'PrimarySample' column
            if 'PrimarySample' in df.columns:
                primary_sample_map = df.drop_duplicates(subset='Institution').set_index('Institution')['PrimarySample']
            else:
                # If 'PrimarySample' doesn't exist, assume False for all
                primary_sample_map = pd.Series(False, index=df['Institution'].unique())

            # Group by Institution and AffiliationId and gender
            group_cols = ['Institution', 'AffiliationId']
            gender_counts = (
                df.groupby(group_cols + ['gender'])
                  .size()
                  .unstack('gender', fill_value=0)
                  .reset_index()
            )

            # Ensure all gender columns are present
            for col in ['male', 'female', 'unknown']:
                if col not in gender_counts.columns:
                    gender_counts[col] = 0

            # Add total_members from male/female/unknown columns
            gender_counts['total_members'] = gender_counts[['male', 'female', 'unknown']].sum(axis=1)

            # Map PrimarySample
            gender_counts['PrimarySample'] = gender_counts['Institution'].map(primary_sample_map).fillna(False)

            return gender_counts

        # Preprocess and count for board_df
        board_gender_counts = preprocess_and_count(board_df)
        board_gender_counts.rename(columns={
            'male': 'male_board',
            'female': 'female_board',
            'unknown': 'unknown_board',
            'total_members': 'total_members_board'
        }, inplace=True)

        # Preprocess and count for double_board_df
        double_board_gender_counts = preprocess_and_count(double_board_df)
        double_board_gender_counts.rename(columns={
            'male': 'male_double',
            'female': 'female_double',
            'unknown': 'unknown_double',
            'total_members': 'total_members_double'
        }, inplace=True)

        # Merge the two gender counts DataFrames
        merged_gender_counts = pd.merge(
            board_gender_counts,
            double_board_gender_counts,
            on=['Institution', 'AffiliationId', 'PrimarySample'],
            how='outer',
            suffixes=('_board', '_double')
        )

        # Fill NaN values with 0 for gender counts
        for col in ['male_board', 'female_board', 'unknown_board', 'total_members_board',
                    'male_double', 'female_double', 'unknown_double', 'total_members_double']:
            if col in merged_gender_counts.columns:
                merged_gender_counts[col] = merged_gender_counts[col].fillna(0)
            else:
                merged_gender_counts[col] = 0

        # Compute weighted average for overlapping institutions
        # Define functions to compute weighted averages
        def weighted_avg(row, col_board, col_double):
            total = row['total_members_board'] + row['total_members_double']
            if total == 0:
                return 0
            return round(
                (row[col_board] * row['total_members_board'] +
                 row[col_double] * row['total_members_double']) / total
            )

        # Apply weighted averages for each gender and cast to int
        merged_gender_counts['male'] = merged_gender_counts.apply(
            lambda row: int(weighted_avg(row, 'male_board', 'male_double')), axis=1
        )
        merged_gender_counts['female'] = merged_gender_counts.apply(
            lambda row: int(weighted_avg(row, 'female_board', 'female_double')), axis=1
        )
        merged_gender_counts['unknown'] = merged_gender_counts.apply(
            lambda row: int(weighted_avg(row, 'unknown_board', 'unknown_double')), axis=1
        )
        merged_gender_counts['total_members'] = merged_gender_counts.apply(
            lambda row: int(row['total_members_board'] + row['total_members_double']), axis=1
        )

        # Add 'Year' column
        merged_gender_counts['Year'] = year

        # Add 'female_president' column
        merged_gender_counts['female_president'] = merged_gender_counts['Institution'].map(
            lambda inst: president_data.get(inst, {}).get('female_president', False)
        )

        # Reorder and select relevant columns
        main_cols = ['Year', 'Institution', 'AffiliationId', 'female_president', 'PrimarySample', 'total_members']
        other_cols = ['male', 'female', 'unknown']
        counts_df = merged_gender_counts[main_cols + other_cols]

        numbers_list.append(counts_df)

    # Concatenate data from all years
    numbers_df = pd.concat(numbers_list, ignore_index=True)
    numbers_df.fillna(0, inplace=True)

    # Sort
    numbers_df.sort_values(by=['Institution', 'AffiliationId', 'Year'], inplace=True)

    # Calculate yearly changes for gender columns
    gender_columns = ['male', 'female', 'unknown']
    for col in gender_columns:
        numbers_df[col + '_change'] = numbers_df.groupby(['Institution', 'AffiliationId'])[col].diff().fillna(0).astype(int)

    # Final sort
    numbers_df.sort_values(by=['Year', 'Institution'], inplace=True)

    return numbers_df




In [41]:
university_board_statistics_df = process_board_data()
university_board_statistics_df.to_csv(altered_dataframe_path + "university_board_statistics.csv", index = False)

  gender_counts['PrimarySample'] = gender_counts['Institution'].map(primary_sample_map).fillna(False)
  gender_counts['PrimarySample'] = gender_counts['Institution'].map(primary_sample_map).fillna(False)
  gender_counts['PrimarySample'] = gender_counts['Institution'].map(primary_sample_map).fillna(False)
  gender_counts['PrimarySample'] = gender_counts['Institution'].map(primary_sample_map).fillna(False)


In [42]:
def map_affiliation_id(affiliation_df, university_board_statistics_df):
    count_updates = 0  # Counter for the number of updates

    for index, row in affiliation_df.iterrows():
        institution = row['FullName']
        affiliation_id = row['AffiliationId']
        
        # Get the number of matching rows before updating
        matching_rows = university_board_statistics_df[
            (university_board_statistics_df['Institution'] == institution) & 
            (university_board_statistics_df['AffiliationId'].isna())
        ]
        
        # Update AffiliationId if matches are found
        if not matching_rows.empty:
            count_updates += len(matching_rows)
            university_board_statistics_df.loc[
                (university_board_statistics_df['Institution'] == institution) & 
                (university_board_statistics_df['AffiliationId'].isna()), 'AffiliationId'
            ] = affiliation_id

    # Print the total number of updates made
    print(f"Total AffiliationId entries updated: {count_updates}")

    return university_board_statistics_df

In [43]:
#combined a couple different scripts here so the variable names are slightly changed and makes code a bit messy 


university_board_statistics = university_board_statistics_df
state_system_df = pd.read_csv(state_path)

# Set default value of StateSystem to False
university_board_statistics['StateSystem'] = False

for index, row in state_system_df.iterrows():
    if pd.notna(row['StateSystem']):
        institution = row['Institution']
        affiliation_id = row['AffiliationId']
        state_system_value = row['StateSystem']
        
        university_board_statistics.loc[
            (university_board_statistics['Institution'] == institution) | 
            (university_board_statistics['AffiliationId'] == affiliation_id), 
            'StateSystem'
        ] = state_system_value

university_board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)

  university_board_statistics.loc[


In [44]:
# for year in years:
#     # Load the data for the current year
#     boards_df = pd.read_csv(f"{boards_path}{year}_boards.csv")
    
#     # Group by Institution and aggregate unique Name values into sets
#     grouped = boards_df.groupby('Institution')['Name'].apply(set).reset_index()

#     # Track processed institutions and new state system rows
#     processed_institutions = set()
#     new_state_system_rows = []

#     # Create a mapping of name sets (as tuples) to lists of institutions that share them
#     name_to_institutions = {}

#     for i, group in grouped.iterrows():
#         institution = group['Institution']
#         names_set = tuple(group['Name'])  # Convert set to a tuple to make it hashable
        
#         # Group institutions by their name sets
#         if names_set not in name_to_institutions:
#             name_to_institutions[names_set] = []
#         name_to_institutions[names_set].append(institution)

#     # Process each group of institutions that share the same name set
#     for names_set, institutions in name_to_institutions.items():
#         if len(institutions) > 1:  # Only consider groups with duplicates
#             processed_institutions.update(institutions)
            
#             # Check if any of the institutions have a state system value
#             is_state_system = university_board_statistics.loc[
#                 (university_board_statistics['Institution'].isin(institutions)) & 
#                 (university_board_statistics['StateSystem'].notna()) &
#                 (university_board_statistics['StateSystem'] != False)
#             ].any().any()

#             if is_state_system:
#                 # Extract the StateSystem value
#                 state_system_name = university_board_statistics.loc[
#                     (university_board_statistics['Institution'].isin(institutions)) & 
#                     (university_board_statistics['StateSystem'].notna()) &
#                     (university_board_statistics['StateSystem'] != False), 'StateSystem'
#                 ].values[0]

#                 # Use one of the institutions' diversity statistics
#                 source_stats = university_board_statistics.loc[
#                     (university_board_statistics['Institution'].isin(institutions)) & 
#                     (university_board_statistics['Year'] == int(year))
#                 ]

#                 if not source_stats.empty:
#                     source_stats = source_stats.iloc[0]
#                     new_row = {
#                         'Year': int(year),
#                         'Institution': state_system_name,
#                         'carnegie_id': np.nan,
#                         'AffiliationId': np.nan,
#                         'female_president': source_stats['female_president'],
#                         # 'poc_president': source_stats['poc_president'],
#                         # 'poc': source_stats['poc'],
#                         # 'white': source_stats['white'],
#                         'female': source_stats['female'],
#                         'male': source_stats['male'],
#                         'unknown': source_stats['unknown'],
#                         # 'poc_change': source_stats['poc_change'],
#                         # 'white_change': source_stats['white_change'],
#                         'unknown_change': source_stats['unknown_change'],
#                         'male_change': source_stats['male_change'],
#                         'female_change': source_stats['female_change'],
#                         'StateSystem': True
#                     }
#                     new_state_system_rows.append(new_row)

#     # Remove duplicate entries from the university statistics
#     university_board_statistics = university_board_statistics[
#         ~((university_board_statistics['Institution'].isin(processed_institutions)) & 
#           (university_board_statistics['Year'] == int(year)))
#     ]

#     # Append new state system rows, if any
#     if new_state_system_rows:
#         new_rows_df = pd.DataFrame(new_state_system_rows)
#         university_board_statistics = pd.concat([university_board_statistics, new_rows_df], ignore_index=True)
#         print(f"Replaced {len(processed_institutions)} duplicate entries with state system rows for year {year}.")

# # Save the updated dataframe to CSV
# university_board_statistics = university_board_statistics.sort_values(by = ["Year", "Institution"])
# university_board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)
# print("Updated university_board_statistics saved.")


In [45]:
# -- Refactored Code --

# 1. Remove any existing 'carnegie_id', 'state', 'control' columns
board_statistics.drop(columns=['carnegie_id', 'state', 'control'], errors='ignore', inplace=True)

# 2. Merge 'carnegie_id' from carnegie_map based on 'AffiliationId'
board_statistics = board_statistics.merge(
    carnegie_map[['AffiliationId', 'carnegie_id']], 
    on='AffiliationId', 
    how='left'
)

# 3. Merge 'state' and 'control' from classification_map based on 'carnegie_id'
board_statistics = board_statistics.merge(
    classification_map[['unitid', 'state', 'control']], 
    left_on='carnegie_id', 
    right_on='unitid', 
    how='left', 
    suffixes=('', '_classification')
)

# 4. Drop the redundant 'unitid' column
board_statistics.drop(columns='unitid', inplace=True)

# 5. Rename columns if suffixes were added
board_statistics.rename(
    columns={'state_classification': 'state', 'control_classification': 'control'},
    inplace=True
)

# 6. Remove any columns that start with "Unnamed"
board_statistics = board_statistics.loc[:, ~board_statistics.columns.str.contains('^Unnamed')]

# Final DataFrame Verification
print("\nMerged board_statistics DataFrame:")
print(board_statistics)

# Check for (and confirm removal of) unnamed columns
unnamed_columns = [col for col in board_statistics.columns if 'Unnamed' in col]
if unnamed_columns:
    print("\nUnnamed Columns Detected and Removed:")
    print(unnamed_columns)
else:
    print("\nNo Unnamed Columns Detected.")


board_statistics['control'] = board_statistics['control'].apply(
    lambda x: 'Private' if 'Private' in str(x) else 'Public' if 'Public' in str(x) else x
)



Merged board_statistics DataFrame:
      Year                      Institution  AffiliationId  female_president  \
0     1999     Abilene Christian University     60205797.0             False   
1     1999               Adelphi University     71965598.0             False   
2     1999              Agnes Scott College     64506506.0              True   
3     1999                   Albion College     45644089.0             False   
4     1999                Alfred University     49502546.0             False   
...    ...                              ...            ...               ...   
5223  2018  Worcester Polytechnic Institute    107077323.0              True   
5224  2018                Xavier University    194120229.0             False   
5225  2018   Xavier University Of Louisiana    169251466.0             False   
5226  2018                  Yale University     32971472.0             False   
5227  2018               Yeshiva University     19772626.0             False   

   

In [46]:
state_to_region = {
    # Northeast
    "CT": "Northeast", "ME": "Northeast", "MA": "Northeast", "NH": "Northeast",
    "RI": "Northeast", "VT": "Northeast", "NJ": "Northeast", "NY": "Northeast", 
    "PA": "Northeast", "DC": "Northeast",
    
    # Midwest
    "IL": "Midwest", "IN": "Midwest", "IA": "Midwest", "KS": "Midwest",
    "MI": "Midwest", "MN": "Midwest", "MO": "Midwest", "NE": "Midwest", 
    "ND": "Midwest", "OH": "Midwest", "SD": "Midwest", "WI": "Midwest",
    
    # South
    "AL": "South", "AR": "South", "DE": "South", "FL": "South",
    "GA": "South", "KY": "South", "LA": "South", "MD": "South",
    "MS": "South", "NC": "South", "OK": "South", "SC": "South",
    "TN": "South", "TX": "South", "VA": "South", "WV": "South",
    
    # West
    "AK": "West", "AZ": "West", "CA": "West", "CO": "West",
    "HI": "West", "ID": "West", "MT": "West", "NV": "West",
    "NM": "West", "OR": "West", "UT": "West", "WA": "West",
    "WY": "West"
}
# board_statistics["regoio"]
board_statistics['region'] = board_statistics['state'].map(state_to_region)
board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)
