In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import os

In [2]:
# File Paths
# Every directory string below ends with a path separator because the other cells build
# file names by plain f-string concatenation (e.g. f"{boards_path}{year}_boards.csv").
# Each backslash is escaped explicitly: sequences such as "\O" and "\V" are invalid escapes
# that Python only tolerates with a warning, and the former "\\\O" also left a doubled
# separator in the path.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary_data = "temporary_data\\"

# Fully qualified directories (and one file) derived from the project root above.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
state_path = f"{absolute_path}{temporary_data}state_systems_validated.csv"

# Valid Years
# Kept as strings because they are interpolated into file names; cells that compare against
# a numeric Year column convert with int(year).
years = ["1999", "2000", "2005", "2008", "2009", "2013"]

In [3]:
# Load the per-institution board statistics and the state-system lookup table.
university_board_statistics = pd.read_csv(f"{altered_dataframe_path}university_board_statistics.csv")
state_system_df = pd.read_csv(state_path)

# Set default value of StateSystem to False.
# The column is created with object dtype on purpose: matching rows are overwritten below
# with whatever value the lookup table holds (presumably a system name, i.e. not a bool).
# Writing such values into a bool-typed column is an incompatible-dtype assignment, which
# recent pandas versions warn about and plan to reject.
university_board_statistics['StateSystem'] = pd.Series(
    False, index=university_board_statistics.index, dtype=object
)

# Only lookup rows that actually carry a StateSystem value are applied.
labelled_systems = state_system_df[state_system_df['StateSystem'].notna()]

for _, lookup_row in labelled_systems.iterrows():
    # A statistics row matches when either its institution name or its affiliation id
    # equals the lookup row's. A missing (NaN) id never compares equal, so it matches nothing.
    matches = (
        (university_board_statistics['Institution'] == lookup_row['Institution'])
        | (university_board_statistics['AffiliationId'] == lookup_row['AffiliationId'])
    )
    university_board_statistics.loc[matches, 'StateSystem'] = lookup_row['StateSystem']

# NOTE(review): this overwrites the same CSV that was read at the top of the cell.
university_board_statistics.to_csv(f"{altered_dataframe_path}university_board_statistics.csv", index=False)

  university_board_statistics.loc[


In [4]:
# For every valid year, find institutions whose boards consist of exactly the same set of
# names, and replace their per-institution rows with a single state-system row when one of
# them carries a StateSystem value.
# NOTE(review): university_board_statistics is reassigned in place, so re-running this cell
# without re-running the cell that builds it operates on already-reduced data.

# Columns copied unchanged from one member institution into the new state-system row.
# (The poc/white columns were already commented out of the original row and stay excluded.)
copied_stat_columns = [
    'female_president',
    'female',
    'male',
    'unknown',
    'unknown_change',
    'male_change',
    'female_change',
]

for year in years:
    year_int = int(year)  # `years` holds strings; the Year column is compared as an integer

    # Load the data for the current year
    boards_df = pd.read_csv(f"{boards_path}{year}_boards.csv")

    # Unique board-member names per institution (Series indexed by Institution).
    names_by_institution = boards_df.groupby('Institution')['Name'].apply(set)

    # Map each distinct board to the institutions that share it. The key is a frozenset:
    # it is hashable and compares by membership only. The previous tuple(set) key depended
    # on set iteration order, which is not guaranteed to be the same for two equal sets,
    # so identical boards could end up under different keys.
    board_to_institutions = defaultdict(list)
    for institution, names in names_by_institution.items():
        board_to_institutions[frozenset(names)].append(institution)

    # Track processed institutions and new state system rows
    processed_institutions = set()
    new_state_system_rows = []

    for institutions in board_to_institutions.values():
        if len(institutions) < 2:
            continue  # board is not shared with any other institution

        # NOTE(review): every institution of a shared-board group is dropped for this year
        # further down, even when no replacement state-system row is produced for the group
        # (no StateSystem value, or no statistics for this year). Confirm this is intended.
        processed_institutions.update(institutions)

        in_group = university_board_statistics['Institution'].isin(institutions)
        has_state_system = (
            university_board_statistics['StateSystem'].notna()
            & (university_board_statistics['StateSystem'] != False)
        )

        # StateSystem values of the group members, taken from any year.
        system_names = university_board_statistics.loc[in_group & has_state_system, 'StateSystem']
        if system_names.empty:
            continue  # none of the institutions belongs to a known state system
        state_system_name = system_names.iloc[0]

        # Use one of the institutions' diversity statistics for this year.
        source_rows = university_board_statistics.loc[
            in_group & (university_board_statistics['Year'] == year_int)
        ]
        if source_rows.empty:
            continue
        source_stats = source_rows.iloc[0]

        new_state_system_rows.append({
            'Year': year_int,
            'Institution': state_system_name,
            'carnegie_id': np.nan,
            'AffiliationId': np.nan,
            **{column: source_stats[column] for column in copied_stat_columns},
            'StateSystem': True,
        })

    # Remove duplicate entries from the university statistics
    is_duplicate = (
        university_board_statistics['Institution'].isin(processed_institutions)
        & (university_board_statistics['Year'] == year_int)
    )
    university_board_statistics = university_board_statistics[~is_duplicate]

    # Append new state system rows, if any
    if new_state_system_rows:
        new_rows_df = pd.DataFrame(new_state_system_rows)
        university_board_statistics = pd.concat([university_board_statistics, new_rows_df], ignore_index=True)
        print(f"Replaced {len(processed_institutions)} duplicate entries with state system rows for year {year}.")

# Save the updated dataframe to CSV
university_board_statistics = university_board_statistics.sort_values(by=["Year", "Institution"])
university_board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)
print("Updated university_board_statistics saved.")


Replaced 6 duplicate entries with state system rows for year 1999.
Replaced 10 duplicate entries with state system rows for year 2000.
Replaced 6 duplicate entries with state system rows for year 2005.
Replaced 11 duplicate entries with state system rows for year 2008.
Replaced 6 duplicate entries with state system rows for year 2009.
Updated university_board_statistics saved.
