In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from matplotlib.lines import Line2D
from nameparser import HumanName
import gender_guesser.detector as gender
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import ethnicolr
from ethnicolr import pred_census_ln

In [175]:
# File Paths
# Every backslash in the root path is escaped explicitly. Bare sequences such
# as "\O" or "\V" are invalid string escapes (SyntaxWarning on Python 3.12+)
# and can leave stray doubled separators in the resulting path.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names; each ends with a separator so it can be appended directly.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts =  "scripts\\"
board_dataframes = "board_dataframes\\"
temporary = "temporary_data\\"

# Full directory paths (root + sub-directory), also separator-terminated.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}" 
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
temporary_data_path = f"{absolute_path}{temporary}"

# Valid Years (kept as strings because they are interpolated into file names)
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

diversity_statistics_path = f"{altered_dataframe_path}diversity_statistics.csv"

In [176]:
def get_last_name(full_name):
    """
    Extracts the last name from a full name.

    Args:
        full_name (str): The full name of a person. Values that are not
            text (for example None or NaN) are tolerated.

    Returns:
        str: The last name as parsed by ``HumanName``, or an empty string
        when ``full_name`` is not text.
    """
    # pandas represents empty CSV cells as float NaN. HumanName expects text,
    # so non-text values short-circuit to an empty name instead of reaching
    # the parser.
    if not isinstance(full_name, (str, bytes)):
        return ""
    return str(HumanName(full_name).last)

# Helper function to extract the first name from a full name
def get_first_name(full_name):
    """
    Extracts the first name from a full name.

    Args:
        full_name (str): The full name of a person. Values that are not
            text (for example None or NaN) are tolerated.

    Returns:
        str: The first name as parsed by ``HumanName``, or an empty string
        when ``full_name`` is not text.
    """
    # pandas represents empty CSV cells as float NaN. HumanName expects text,
    # so non-text values short-circuit to an empty name instead of reaching
    # the parser.
    if not isinstance(full_name, (str, bytes)):
        return ""
    return str(HumanName(full_name).first)

# Classifies ethnicity into 'white', 'poc', or 'unknown'
def classify_ethnicity(ethnicity):
    """
    Collapse a detailed ethnicity label into a coarse group.

    Args:
        ethnicity (str): The ethnicity label to collapse.

    Returns:
        str: 'white' for the label 'white'; 'poc' for 'api', 'black',
        'hispanic' or 'aian'; 'unknown' for anything else.
    """
    poc_labels = ('api', 'black', 'hispanic', 'aian')

    if ethnicity == 'white':
        return 'white'
    # The comparison is exact, so differently-cased or missing labels fall
    # through to 'unknown'.
    return 'poc' if ethnicity in poc_labels else 'unknown'

# Classifies gender into 'male', 'female', or 'unknown'
def classify_gender(gender_str):
    """
    Collapse a raw gender prediction into 'male', 'female' or 'unknown'.

    Args:
        gender_str (str): The raw gender prediction string.

    Returns:
        str: 'male' for 'male'/'mostly_male', 'female' for
        'female'/'mostly_female', and 'unknown' for every other value.
    """
    groups = (
        ('male', ('male', 'mostly_male')),
        ('female', ('female', 'mostly_female')),
    )
    for group, raw_labels in groups:
        if gender_str in raw_labels:
            return group
    return 'unknown'

# Predicts gender based on first names and adds a 'gender' column to the dataframe
def pred_gender_fn(df, first_name_col):
    """
    Add a 'gender' column predicted from a first-name column.

    Each first name is passed to a ``gender.Detector`` and the raw
    prediction is collapsed with ``classify_gender``.

    Args:
        df (pd.DataFrame): The dataframe containing the first name column.
            It is modified in place.
        first_name_col (str): The column name containing first names.

    Returns:
        pd.DataFrame: The same dataframe, now carrying a 'gender' column.
    """
    detector = gender.Detector()
    raw_predictions = df[first_name_col].apply(detector.get_gender)
    df['gender'] = raw_predictions.apply(classify_gender)
    return df


In [177]:
def get_last_name(full_name):
    """
    Extracts the last name from a full name.

    Args:
        full_name (str): The full name of a person. Values that are not
            text (for example None or NaN) are tolerated.

    Returns:
        str: The last name as parsed by ``HumanName``, or an empty string
        when ``full_name`` is not text.
    """
    # pandas represents empty CSV cells as float NaN. HumanName expects text,
    # so non-text values short-circuit to an empty name instead of reaching
    # the parser.
    if not isinstance(full_name, (str, bytes)):
        return ""
    return str(HumanName(full_name).last)

# Helper function to extract the first name from a full name
def get_first_name(full_name):
    """
    Extracts the first name from a full name.

    Args:
        full_name (str): The full name of a person. Values that are not
            text (for example None or NaN) are tolerated.

    Returns:
        str: The first name as parsed by ``HumanName``, or an empty string
        when ``full_name`` is not text.
    """
    # pandas represents empty CSV cells as float NaN. HumanName expects text,
    # so non-text values short-circuit to an empty name instead of reaching
    # the parser.
    if not isinstance(full_name, (str, bytes)):
        return ""
    return str(HumanName(full_name).first)

# Classifies ethnicity into 'white', 'poc', or 'unknown'
def classify_ethnicity(ethnicity):
    """
    Collapse a detailed ethnicity label into a coarse group.

    Args:
        ethnicity (str): The ethnicity label to collapse.

    Returns:
        str: 'white' for the label 'white'; 'poc' for 'api', 'black',
        'hispanic' or 'aian'; 'unknown' for anything else.
    """
    poc_labels = ('api', 'black', 'hispanic', 'aian')

    if ethnicity == 'white':
        return 'white'
    # The comparison is exact, so differently-cased or missing labels fall
    # through to 'unknown'.
    return 'poc' if ethnicity in poc_labels else 'unknown'

# Classifies gender into 'male', 'female', or 'unknown'
def classify_gender(gender_str):
    """
    Collapse a raw gender prediction into 'male', 'female' or 'unknown'.

    Args:
        gender_str (str): The raw gender prediction string.

    Returns:
        str: 'male' for 'male'/'mostly_male', 'female' for
        'female'/'mostly_female', and 'unknown' for every other value.
    """
    groups = (
        ('male', ('male', 'mostly_male')),
        ('female', ('female', 'mostly_female')),
    )
    for group, raw_labels in groups:
        if gender_str in raw_labels:
            return group
    return 'unknown'

# Predicts gender based on first names and adds a 'gender' column to the dataframe
def pred_gender_fn(df, first_name_col):
    """
    Add a 'gender' column predicted from a first-name column.

    Each first name is passed to a ``gender.Detector`` and the raw
    prediction is collapsed with ``classify_gender``.

    Args:
        df (pd.DataFrame): The dataframe containing the first name column.
            It is modified in place.
        first_name_col (str): The column name containing first names.

    Returns:
        pd.DataFrame: The same dataframe, now carrying a 'gender' column.
    """
    detector = gender.Detector()
    raw_predictions = df[first_name_col].apply(detector.get_gender)
    df['gender'] = raw_predictions.apply(classify_gender)
    return df


def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Determines, per institution, whether its president is classified as female.

    For every year, reads ``{altered_dataframe_path}{year}_presidents.csv``
    (which must contain 'Name' and 'Institution' columns), predicts a gender
    from each president's first name and records whether it is 'female'.

    Args:
        valid_years (list): A list of years to process.
        altered_dataframe_path (str): The path to the directory containing president CSV files.

    Returns:
        dict: A dictionary with 'Institution' as keys and 'female_president' as boolean values.
        When an institution appears more than once (within a year or across
        years), the last row processed wins.
    """
    # One detector for the whole run: building it inside the per-row lambda
    # would re-create it for every single president.
    detector = gender.Detector()
    president_data = {}

    for year in valid_years:
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")

        # First name -> raw prediction -> 'male' / 'female' / 'unknown'
        first_names = president_df['Name'].apply(get_first_name)
        genders = first_names.apply(detector.get_gender).apply(classify_gender)
        is_female = (genders == 'female').tolist()

        # Rows are applied in file order, so later rows overwrite earlier ones.
        president_data.update(zip(president_df['Institution'], is_female))

    return president_data

def classify_president_genders(valid_years, altered_dataframe_path):
    """
    Determines, per institution, whether its president is classified as female.

    For every year, reads ``{altered_dataframe_path}{year}_presidents.csv``
    (which must contain 'Name' and 'Institution' columns), predicts a gender
    from each president's first name and records whether it is 'female'.

    Args:
        valid_years (list): A list of years to process.
        altered_dataframe_path (str): The path to the directory containing president CSV files.

    Returns:
        dict: A dictionary with 'Institution' as keys and 'female_president' as boolean values.
        When an institution appears more than once (within a year or across
        years), the last row processed wins.
    """
    # One detector for the whole run: building it inside the per-row lambda
    # would re-create it for every single president.
    detector = gender.Detector()
    president_data = {}

    for year in valid_years:
        president_df = pd.read_csv(f"{altered_dataframe_path}{year}_presidents.csv")

        # First name -> raw prediction -> 'male' / 'female' / 'unknown'
        first_names = president_df['Name'].apply(get_first_name)
        genders = first_names.apply(detector.get_gender).apply(classify_gender)
        is_female = (genders == 'female').tolist()

        # Rows are applied in file order, so later rows overwrite earlier ones.
        president_data.update(zip(president_df['Institution'], is_female))

    return president_data


def process_board_data():
    """
    Loads the per-year board CSVs and concatenates them into one dataframe.

    For each entry of the module-level ``years`` list this reads
    ``{altered_dataframe_path}{year}_presidents.csv``,
    ``{boards_path}{year}_boards.csv`` and
    ``{gpt_dataframe_path}{year}_gptDataframe.csv``, derives name, gender and
    ethnicity columns, and prints a few diagnostic frames along the way.

    NOTE(review): the frame collected for the result is the board dataframe
    as returned by ``read_csv``, appended *before* any processing. The
    'first_name' / 'last_name' columns are assigned on that same object and
    therefore end up in the result. Columns added after ``board_df`` is
    rebound to the return value of ``pred_census_ln`` only reach the result
    if that call returns the same object -- not visible from here, confirm.
    The president and institution frames are computed but never used in the
    returned value.

    Returns:
        pd.DataFrame: All collected board dataframes concatenated with a
        fresh 0..n-1 index.
    """
    all_years_data = []
    for year in years:
        president_df = pd.read_csv(f'{altered_dataframe_path}{year}_presidents.csv')
        board_df = pd.read_csv(f'{boards_path}{year}_boards.csv')
        # NOTE(review): appended here, ahead of the processing below; the
        # end-of-loop append further down is commented out.
        all_years_data.append(board_df)
        institutions_df = pd.read_csv(f'{gpt_dataframe_path}{year}_gptDataframe.csv')
        # One row per (Institution, AffiliationId) pair; not referenced again below.
        institutions_df = institutions_df[['Institution', 'AffiliationId']].drop_duplicates()

        # Extract gender + ethnicity of the presidents
        president_df['first_name'] = president_df['Name'].apply(get_first_name)
        president_df['last_name'] = president_df['Name'].apply(get_last_name)
        # The 'race' column read below is presumably added by pred_census_ln -- TODO confirm.
        president_df = pred_census_ln(president_df, 'last_name')
        president_df = pred_gender_fn(president_df, 'first_name')

        president_df['female_president'] = president_df['gender'].apply(lambda x: True if x == 'female' else False)
        president_df['poc_president'] = president_df['race'].apply(lambda x: True if x in ['api', 'black', 'hispanic', 'aian'] else False)
        # Keep the first row per institution; president_df is not used after this point.
        president_df = president_df.drop_duplicates(subset='Institution').reset_index(drop=True)

        # Extract first and last names for board members
        board_df['first_name'] = board_df['Name'].apply(get_first_name)
        board_df['last_name'] = board_df['Name'].apply(get_last_name)

        print(board_df[board_df['Name'].isnull()])  # Identify rows with missing names


        print(board_df[board_df['last_name'].isnull()])  # Check for missing last names


        # Predict ethnicity using the last name
        board_df = pred_census_ln(board_df, 'last_name')

        # Predict gender using the first name
        board_df = pred_gender_fn(board_df, 'first_name')

        # Classify ethnicity and gender
        board_df['ethnicity_group'] = board_df['race'].apply(classify_ethnicity)
        # pred_gender_fn already stores the classified value, so this is a plain copy.
        board_df['gender_group'] = board_df['gender']
        board_df['Year'] = year

        print(board_df.head())
        # all_years_data.append(board_df)
    all_years_data_df = pd.concat(all_years_data, ignore_index=True)
    return all_years_data_df
        



In [178]:
# Build the combined board table and persist it alongside the other derived CSVs.
university_board_statistics_df = process_board_data()
university_board_statistics_df.to_csv(
    f"{altered_dataframe_path}university_board_statistics.csv",
    index=False,
)

                 Name              Position                   Institution  \
0      G. Thane Akins               Trustee  Abilene Christian University   
1             Ted Poe               Trustee  Abilene Christian University   
2  Hubert Pickett, Jr   Assistant Secretary  Abilene Christian University   
3      H. Lynn Packer               Trustee  Abilene Christian University   
4     Harrold D. Owen  Senior Board Trustee  Abilene Christian University   

  SubInstitution Education                   Other Affiliation Career Notes  \
0            NaN       NaN                 Member of the Board    NaN   NaN   
1            NaN       NaN   Judge, 228th State District Court    NaN   NaN   
2            NaN       NaN  Principal, Jefferson Middle School    NaN   NaN   
3            NaN       NaN                                 NaN    NaN   NaN   
4            NaN       NaN          Owner, Owen Oil Tools, Inc    NaN   NaN   

   AffiliationId  carnegie_id PrimarySample  level_0  index St

In [179]:
def map_affiliation_id(affiliation_df, university_board_statistics_df):
    """
    Fill missing 'AffiliationId' values by matching institution names.

    For each row of ``affiliation_df`` (in order), every row of
    ``university_board_statistics_df`` whose 'Institution' equals that row's
    'FullName' and whose 'AffiliationId' is still missing receives the row's
    'AffiliationId'. Existing ids are never overwritten. The total number of
    filled entries is printed.

    Args:
        affiliation_df (pd.DataFrame): Lookup table with 'FullName' and
            'AffiliationId' columns.
        university_board_statistics_df (pd.DataFrame): Table with
            'Institution' and 'AffiliationId' columns; modified in place.

    Returns:
        pd.DataFrame: The same ``university_board_statistics_df`` object.
    """
    boards = university_board_statistics_df  # alias only; updates happen in place
    total_filled = 0

    for _, affiliation in affiliation_df.iterrows():
        name = affiliation['FullName']
        new_id = affiliation['AffiliationId']

        # Rows of this institution that still lack an id.
        needs_id = (boards['Institution'] == name) & boards['AffiliationId'].isna()
        n_missing = int(needs_id.sum())
        if n_missing == 0:
            continue

        total_filled += n_missing
        boards.loc[needs_id, 'AffiliationId'] = new_id

    # Print the total number of updates made
    print(f"Total AffiliationId entries updated: {total_filled}")

    return boards

In [180]:
# affiliation_df = pd.read_csv(f"{temporary_data_path}affiliation.csv")
# university_board_statistics_df = map_affiliation_id(affiliation_df, university_board_statistics_df)
# university_board_statistics_df.to_csv(altered_dataframe_path + "university_board_statistics.csv", index = False)