In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import ethnicolr
from ethnicolr import pred_census_ln




In [2]:
# File Paths
# Every backslash is escaped explicitly: unrecognised escapes such as "\O" or "\V"
# emit a SyntaxWarning on Python 3.12+ and would leave a doubled separator in the path.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names (each keeps its trailing separator so file names can be appended directly)
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"

# Full directory paths built from the project root
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"

# Valid Years (kept as strings because they are interpolated into CSV file names)
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

In [3]:
# Files created by this notebook
# The diversity statistics CSV lives in the altered-dataframes directory.
diversity_statistics_path = altered_dataframe_path + "diversity_statistics.csv"

In [4]:
def get_last_name(full_name):
    """
    Return the last-name component of a full name, as parsed by HumanName.

    Args:
        full_name (str): The full name of a person.

    Returns:
        str: The parsed last name.
    """
    return str(HumanName(full_name).last)

# Helper function to extract the first name from a full name
def get_first_name(full_name):
    """
    Return the first-name component of a full name, as parsed by HumanName.

    Args:
        full_name (str): The full name of a person.

    Returns:
        str: The parsed first name.
    """
    return str(HumanName(full_name).first)

# Classifies ethnicity into 'white', 'poc', or 'unknown'
def classify_ethnicity(ethnicity):
    """
    Collapse a detailed ethnicity label into a coarse group.

    Args:
        ethnicity (str): The ethnicity label to collapse.

    Returns:
        str: 'white' for the label 'white'; 'poc' for 'api', 'black',
            'hispanic' or 'aian'; 'unknown' for any other value.
    """
    poc_labels = ['api', 'black', 'hispanic', 'aian']

    if ethnicity == 'white':
        return 'white'
    return 'poc' if ethnicity in poc_labels else 'unknown'

# Classifies gender into 'male', 'female', or 'unknown'
def classify_gender(gender_str):
    """
    Collapse a raw gender prediction into 'male', 'female', or 'unknown'.

    Args:
        gender_str (str): The raw gender prediction string.

    Returns:
        str: 'male' for 'male'/'mostly_male', 'female' for
            'female'/'mostly_female', and 'unknown' for any other value.
    """
    # The "mostly_<group>" predictions are folded into their plain group.
    for group in ('male', 'female'):
        if gender_str in [group, 'mostly_' + group]:
            return group
    return 'unknown'

# Predicts gender based on first names and adds a 'gender' column to the dataframe
def pred_gender_fn(df, first_name_col):
    """
    Add a classified 'gender' column derived from a first-name column.

    Each first name is passed to a gender_guesser Detector and the raw
    prediction is collapsed with classify_gender. The dataframe is modified
    in place and also returned.

    Args:
        df (pd.DataFrame): The dataframe containing the first name column.
        first_name_col (str): The column name containing first names.

    Returns:
        pd.DataFrame: The same dataframe with a 'gender' column holding
            'male', 'female', or 'unknown'.
    """
    detector = gender.Detector()
    raw_predictions = df[first_name_col].apply(detector.get_gender)
    df['gender'] = raw_predictions.apply(classify_gender)
    return df


def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Determine, for each institution, whether its president is classified as female.

    For every year the file ``{altered_dataframe_path}{year}_presidents.csv`` is read,
    the first name is extracted from the 'Name' column, a gender is predicted from it
    and collapsed with classify_gender. Years are processed in the order given, so when
    an institution appears more than once the last row read determines its value.

    Args:
        valid_years (list): A list of years to process.
        altered_dataframe_path (str): The path to the directory containing president CSV files.

    Returns:
        dict: A dictionary with 'Institution' as keys and 'female_president' as boolean values.
    """
    # One detector is shared by every lookup; building a new one per name (as a
    # per-row lambda would) repeats the detector's set-up work for each row.
    detector = gender.Detector()
    president_data = {}

    for year in valid_years:
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")

        # Extract first names, predict gender, and collapse to male/female/unknown
        first_names = president_df['Name'].apply(get_first_name)
        genders = first_names.apply(detector.get_gender).apply(classify_gender)
        is_female = genders == 'female'

        # Later rows (and later years) overwrite earlier entries for the same institution
        president_data.update(zip(president_df['Institution'].tolist(), is_female.tolist()))

    return president_data

def process_board_data(valid_years):
    """
    Build per-institution, per-year board composition counts with president flags and
    year-over-year changes.

    For every year the function reads ``{year}_presidents.csv`` (under
    ``altered_dataframe_path``) and ``{year}_boards.csv`` (under ``boards_path``), keeps
    only rows with ``PrimarySample == True``, predicts ethnicity from last names
    (``pred_census_ln``) and gender from first names (``pred_gender_fn``), and counts
    board members per ('Institution', 'carnegie_id', 'AffiliationId') group.

    Args:
        valid_years (list): A list of years to process; each is interpolated into the
            CSV file names.

    Returns:
        pd.DataFrame: One row per institution and year, sorted by 'Year' then
            'Institution', containing:
            - 'Year', 'Institution', 'carnegie_id';
            - 'AffiliationId' taken from the president file (NaN when the institution
              has no president row or no id there);
            - 'female_president' / 'poc_president' booleans (False when the institution
              has no president row);
            - 'total_members' (sum of the gender counts);
            - ethnicity counts 'poc', 'white' and, if any ethnicity is unclassified,
              'unknown_ethnicity';
            - gender counts 'male', 'female', 'unknown';
            - a '<count>_change' column per count column holding the difference from
              the institution's previous processed year.
    """
    lead_columns = ['Year', 'Institution', 'carnegie_id', 'AffiliationId',
                    'female_president', 'poc_president', 'total_members']
    group_keys = ['Institution', 'carnegie_id', 'AffiliationId']
    gender_labels = ['male', 'female', 'unknown']
    poc_labels = ['api', 'black', 'hispanic', 'aian']

    numbers_list = []

    for year in valid_years:
        # ---- President data for the current year ----
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")

        # Keep only the primary sample; copy so the column assignments below write to an
        # independent frame rather than to a slice of the frame that was read.
        president_df = president_df[president_df['PrimarySample'] == True].copy()

        # Extract first and last names and classify gender/ethnicity
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['last_name'] = president_df['Name'].apply(get_last_name)
        president_df = pred_census_ln(president_df, 'last_name')
        president_df = pred_gender_fn(president_df, 'first_name')

        president_df['female_president'] = president_df['gender'] == 'female'
        president_df['poc_president'] = president_df['race'].isin(poc_labels)

        # Keep the first row per institution; this guarantees the unique index that
        # to_dict('index') needs.
        president_df = president_df.drop_duplicates(subset='Institution').reset_index(drop=True)
        president_data = (
            president_df
            .set_index('Institution')[['female_president', 'poc_president', 'AffiliationId']]
            .to_dict('index')
        )

        # ---- Board data for the current year ----
        board_df = pd.read_csv(f"{boards_path}{year}_boards.csv")
        board_df = board_df[board_df['PrimarySample'] == True].copy()

        board_df['first_name'] = board_df['Name'].apply(get_first_name)
        board_df['last_name'] = board_df['Name'].apply(get_last_name)
        board_df = pred_census_ln(board_df, 'last_name')   # adds 'race'
        board_df = pred_gender_fn(board_df, 'first_name')  # adds 'gender'

        board_df['ethnicity_group'] = board_df['race'].apply(classify_ethnicity)
        board_df['gender_group'] = board_df['gender']

        # Ethnicity counts. An 'unknown' ethnicity column is renamed so it cannot collide
        # with the gender 'unknown' column in the merge below (a collision would produce
        # 'unknown_x'/'unknown_y' and break the total_members lookup).
        ethnicity_counts = (
            board_df.groupby(group_keys + ['ethnicity_group']).size()
            .unstack('ethnicity_group', fill_value=0)
            .rename(columns={'unknown': 'unknown_ethnicity'})
            .reset_index()
        )

        # Gender counts; make sure all three gender columns exist even if a label
        # never occurs in this year's data.
        gender_counts = (
            board_df.groupby(group_keys + ['gender_group']).size()
            .unstack('gender_group', fill_value=0)
            .reset_index()
        )
        for label in gender_labels:
            if label not in gender_counts.columns:
                gender_counts[label] = 0

        # Merge ethnicity and gender counts
        counts_df = pd.merge(ethnicity_counts, gender_counts, on=group_keys, how='outer')

        # Every board member falls into exactly one gender bucket
        counts_df['total_members'] = counts_df[gender_labels].sum(axis=1)
        counts_df['Year'] = year

        # President attributes, looked up by institution name
        counts_df['female_president'] = counts_df['Institution'].map(
            lambda inst: president_data.get(inst, {}).get('female_president', False))
        counts_df['poc_president'] = counts_df['Institution'].map(
            lambda inst: president_data.get(inst, {}).get('poc_president', False))
        # Unknown ids stay NaN (not '') so that a later `.isna()` back-fill can find them.
        # NOTE(review): this replaces the board file's own AffiliationId (used as a
        # grouping key above) with the president file's value - confirm that is intended.
        counts_df['AffiliationId'] = counts_df['Institution'].map(
            lambda inst: president_data.get(inst, {}).get('AffiliationId', float('nan')))

        # Reorder columns for clarity: identifying columns first, counts after
        remaining = [col for col in counts_df.columns if col not in lead_columns]
        numbers_list.append(counts_df[lead_columns + remaining])

    # Concatenate data from all years
    numbers_df = pd.concat(numbers_list, ignore_index=True)

    # A count column absent in some year comes out of the concat as NaN, meaning zero
    # members. Only count columns are filled; a missing AffiliationId must remain NaN.
    count_columns = [col for col in numbers_df.columns if col not in lead_columns]
    numbers_df[count_columns] = numbers_df[count_columns].fillna(0)

    # diff() compares consecutive rows inside each (Institution, carnegie_id) group, so
    # rows must be in Year order within the group. AffiliationId is deliberately not a
    # sort key: it can differ between years and would break the chronology.
    numbers_df = numbers_df.sort_values(by=['Institution', 'carnegie_id', 'Year'])

    change_sources = ['poc', 'white', 'unknown_ethnicity', 'unknown', 'male', 'female']
    for col in change_sources:
        if col in numbers_df.columns:
            numbers_df[col + '_change'] = numbers_df.groupby(['Institution', 'carnegie_id'])[col].diff()

    # Sort values for clarity
    return numbers_df.sort_values(by=['Year', 'Institution'])


In [5]:
# Build the board statistics dataframe for every year listed in `years`.
university_board_statistics_df = process_board_data(years)




In [6]:
def map_affiliation_id(affiliation_df, university_board_statistics_df, missing_markers=("", 0)):
    """
    Fill in missing 'AffiliationId' values by matching 'Institution' to 'FullName'.

    A statistics row is considered to lack an id when its 'AffiliationId' is NaN or
    equals one of ``missing_markers`` (placeholder values that earlier fill steps may
    have written instead of NaN). Such a row receives the first non-null
    'AffiliationId' listed in ``affiliation_df`` for a 'FullName' equal to the row's
    'Institution'. Rows that already carry an id, or whose institution has no usable
    match, are left untouched. The total number of filled rows is printed.

    Args:
        affiliation_df (pd.DataFrame): Lookup table with 'FullName' and 'AffiliationId' columns.
        university_board_statistics_df (pd.DataFrame): Frame with 'Institution' and
            'AffiliationId' columns; it is updated in place.
        missing_markers (iterable): Non-NaN values that also count as "no id".
            Pass an empty tuple to treat only NaN as missing.

    Returns:
        pd.DataFrame: The same ``university_board_statistics_df`` object, after the update.
    """
    stats = university_board_statistics_df

    # Build FullName -> AffiliationId once. Entries without a usable id are skipped and
    # the first remaining entry per name wins.
    known = affiliation_df.dropna(subset=['FullName', 'AffiliationId'])
    lookup = {}
    for name, aff_id in zip(known['FullName'].tolist(), known['AffiliationId'].tolist()):
        lookup.setdefault(name, aff_id)

    # Flag rows that still need an id
    current_ids = stats['AffiliationId']
    is_missing = current_ids.isna()
    for marker in missing_markers:
        is_missing = is_missing | current_ids.eq(marker)

    # Positions (not labels) are used so the update does not depend on the frame's index
    institutions = stats['Institution'].tolist()
    fill_positions = [
        pos
        for pos, (inst, missing) in enumerate(zip(institutions, is_missing.tolist()))
        if missing and inst in lookup
    ]

    if fill_positions:
        id_col = stats.columns.get_loc('AffiliationId')
        stats.iloc[fill_positions, id_col] = [lookup[institutions[pos]] for pos in fill_positions]

    # Print the total number of updates made
    count_updates = len(fill_positions)
    print(f"Total AffiliationId entries updated: {count_updates}")

    return stats

In [7]:
# Read the affiliation table, run map_affiliation_id over the board statistics,
# and save the result as a CSV in the altered-dataframes directory.
affiliation_df = pd.read_csv(temporary_data_path + "affiliation.csv")
university_board_statistics_df = map_affiliation_id(
    affiliation_df=affiliation_df,
    university_board_statistics_df=university_board_statistics_df,
)
university_board_statistics_df.to_csv(
    f"{altered_dataframe_path}university_board_statistics.csv",
    index=False,
)

Total AffiliationId entries updated: 0
