In [481]:
import pandas as pd
import os
import string

In [482]:
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"
altered_dataframes = "altered_dataframes\\"
board_dataframes = "board_dataframes\\"
final_scripts = "final_scripts\\"
regression = "regression\\"
normalized_dataframes = "normalized_dataframes\\"


years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [483]:
#time invariant dfs
affiliation_df = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_systems.csv")
original_affiliation_df  = pd.read_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\affiliation_original.csv")

#create a dictionary mapping state system -> institutions within the system
affiliation_dict = affiliation_df.groupby("StateSystem")["AffiliationId"].apply(list).to_dict()

affiliation_inverted = {
    aff_id: state_system
    for state_system, aff_ids in affiliation_dict.items()
    for aff_id in aff_ids
}

# for key, val in affiliation_dict.items():
#     print(key)


In [484]:
def remove_non_samples(df: pd.DataFrame) -> pd.DataFrame:
    """Filter the DataFrame to include only rows where 'PrimarySample' is True."""
    return df[df['PrimarySample'] == True]


In [485]:
'''map the state systems to an affiliation id if they have one'''

matched_df = original_affiliation_df[original_affiliation_df["FullName"].isin(affiliation_dict.keys())].copy()

matched_df["StateSystem"] = matched_df["FullName"]

mapping_df = matched_df[["StateSystem", "AffiliationId"]]

mapping_df = mapping_df.drop_duplicates()


mapping_df.to_csv(f"{absolute_path}{final_scripts}{regression}regression_stats\\system_mapping.csv", index = False)


In [486]:
system_id_map = {}
nan_id_map = {}

for _, row in mapping_df.iterrows():
    state_system = row["StateSystem"]
    board_aff_id = row["AffiliationId"]
    if pd.isna(board_aff_id):
        nan_id_map[state_system] = affiliation_dict.get(state_system, [])
    else:
        system_id_map[board_aff_id] = affiliation_dict.get(state_system, [])

# Display the resulting dictionary
print(system_id_map)
print(nan_id_map)

{127339247.0: [184813773, 67328108, 67328108, 142934699, 142934699, 142934699, 59897056, 59897056, 43369023, 43369023, 26538001, 71838634], 174216632.0: [125687163, 125687163, 125687163], 4210131357.0: [92446798, 92446798, 92446798, 92446798], 4210165361.0: [120156002, 106969075, 155093810], 2801365651.0: [189590672, 146416000, 39587148], 4210141039.0: [61937129, 99041443, 368840534, 44854399], 4210127926.0: [57328836, 24571045, 24571045], 29957033.0: [200885203, 161171246, 161171246], 1327163397.0: [392282, 392282, 392282, 392282, 123946342, 123946342, 123946342, 123946342, 63190737, 63190737, 63190737, 63190737, 59553526, 59553526, 59553526, 59553526, 59553526], 2801649442.0: [8248082, 63772739, 19700959, 106165777, 33213144, 11874761, 2613432], 173268674.0: [75063564, 91045830, 91045830, 91045830, 206651237, 206651237, 96749437, 96749437, 164185940, 164185940, 181414168, 181414168], 2801273398.0: [191429286, 13511017, 13511017, 13511017, 13511017], 4210088475.0: [12315562, 926076166

In [None]:
#create a mapping from AffiliationId to FullName from original_affiliation_df
inst_mapping = dict(zip(original_affiliation_df["AffiliationId"], original_affiliation_df["FullName"]))
primary_sample_mapping = dict(
    original_affiliation_df.drop_duplicates("AffiliationId").set_index("AffiliationId")["PrimarySample"]
)

for year in years:
    board_path = os.path.join(absolute_path, board_dataframes, f"{year}_boards.csv")
    double_board_path = os.path.join(absolute_path, board_dataframes, f"{year}_double_board.csv")
    board_df = pd.read_csv(board_path)
    double_board_df = pd.read_csv(double_board_path)
    board_df['AffiliationId'] = board_df['AffiliationId'].fillna(0)
    double_board_df['AffiliationId'] = double_board_df['AffiliationId'].fillna(0)
    #get set of existing affiliation ids from the boards to check if there is already a board when we expand
    existing_affiliation_ids_board = set(board_df["AffiliationId"].unique())
    existing_affiliation_ids_double = set(double_board_df["AffiliationId"].unique())


    expanded_groups = []
    for affiliation_id, group in board_df.groupby("AffiliationId"):
        expanded_groups.append(group)
        #if the current institution is in the state system map
        if affiliation_id in system_id_map:
            for new_affiliation_id in system_id_map[affiliation_id]:
                #If the new affiliation id already exists
                if new_affiliation_id in existing_affiliation_ids_board:
                    # For years greater than 2010, overwrite the existing group (these schools are always reported under the same board. we overwrite because the board is incorrect initially)
                    if int(year) > 2010:
                        expanded_groups = [g for g in expanded_groups if g["AffiliationId"].iloc[0] != new_affiliation_id]
                        print(f"{year}: Overwriting group for affiliation id {new_affiliation_id}")
                    else:
                        # Otherwise, skip expanding this group (in the previous years, they report these boards separately a lot of the time)
                        continue
                new_group = group.copy()
                new_group["AffiliationId"] = new_affiliation_id
                #lookup the institution name using the new affiliation id
                new_inst = inst_mapping.get(new_affiliation_id, new_group["Institution"].iloc[0])
                new_group["Institution"] = new_inst
                expanded_groups.append(new_group)
    expanded_board_df = pd.concat(expanded_groups, ignore_index=True)

    #Process double_board_df similarly:
    expanded_groups_db = []
    for affiliation_id, group in double_board_df.groupby("AffiliationId"):
        expanded_groups_db.append(group)
        if affiliation_id in system_id_map:
            for new_affiliation_id in system_id_map[affiliation_id]:
                if new_affiliation_id in existing_affiliation_ids_double:
                    if int(year) > 2010:
                        expanded_groups_db = [g for g in expanded_groups_db if g["AffiliationId"].iloc[0] != new_affiliation_id]
                        print(f"{year}: Overwriting double board group for affiliation id {new_affiliation_id}")
                    else:
                        continue
                new_group = group.copy()
                new_group["AffiliationId"] = new_affiliation_id
                new_inst = inst_mapping.get(new_affiliation_id, new_group["Institution"].iloc[0])
                new_group["Institution"] = new_inst
                expanded_groups_db.append(new_group)
    expanded_double_board_df = pd.concat(expanded_groups_db, ignore_index=True)

    #Update PrimarySample column because when we map from the system -> new insittution, it says primary sample is false because the system itself is not a sample
    expanded_board_df["PrimarySample"] = expanded_board_df["AffiliationId"].map(primary_sample_mapping)
    expanded_double_board_df["PrimarySample"] = expanded_double_board_df["AffiliationId"].map(primary_sample_mapping)
    
    expanded_board_df.sort_values(by=['Institution'], inplace=True)
    expanded_double_board_df.sort_values(by=['Institution'], inplace=True)

    expanded_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_boards_regression.csv"), index=False)
    expanded_double_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_double_boards_regression.csv"), index=False)


In [None]:
'''
check for system-> institutions by the institution name
This is a backup, because some of the systems dont have an affiliation id
it only edits the remaining systems that have yet to be mapped, doesnt edit anything else
'''

'''
check for system-> institutions by the institution name
This is a backup, because some of the systems dont have an affiliation id
it only edits the remaining systems that have yet to be mapped, doesnt edit anything else
'''

def normalize_institution(s):
    # Remove punctuation, strip whitespace, and convert to title case
    return s.strip().translate(str.maketrans('', '', string.punctuation)).title()

#Create a normalized version of nan_id_map using the normalized institution name as key.
normalized_nan_id_map = {normalize_institution(key): value for key, value in nan_id_map.items()}

inst_mapping = dict(zip(original_affiliation_df["AffiliationId"], original_affiliation_df["FullName"]))
primary_sample_mapping = dict(
    original_affiliation_df.drop_duplicates("AffiliationId").set_index("AffiliationId")["PrimarySample"]
)

for year in years:
    board_path = os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_boards_regression.csv")
    double_board_path = os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_double_boards_regression.csv")
    board_df = pd.read_csv(board_path)
    double_board_df = pd.read_csv(double_board_path)
    
    #get the set of existing affiliation ids from the original board data.
    existing_affiliation_ids_board = set(board_df["AffiliationId"].unique())
    existing_affiliation_ids_double = set(double_board_df["AffiliationId"].unique())
    
    # Process board_df:
    expanded_groups = []
    for institution, group in board_df.groupby("Institution"):
        expanded_groups.append(group)
        norm_institution = normalize_institution(institution)
        print(norm_institution, normalized_nan_id_map)
        #expand the board if exists in the systems dict
        if norm_institution in normalized_nan_id_map:
            print(f"{year}: Expanding for institution '{institution}' (normalized: '{norm_institution}')")
            for new_affiliation_id in normalized_nan_id_map[norm_institution]:
                #Check if a group for this new affiliation id already exists.
                if new_affiliation_id in existing_affiliation_ids_board:
                    if int(year) > 2010:
                        expanded_groups = [g for g in expanded_groups if g["AffiliationId"].iloc[0] != new_affiliation_id]
                        print(f"{year}: Overwriting group for affiliation id {new_affiliation_id}")
                    else:
                        continue
                new_group = group.copy()
                new_group["AffiliationId"] = new_affiliation_id
                new_inst = inst_mapping.get(new_affiliation_id, institution)
                new_group["Institution"] = new_inst
                expanded_groups.append(new_group)
    expanded_board_df = pd.concat(expanded_groups, ignore_index=True)
    
    #Process double_board_df similarly:
    expanded_groups_db = []
    for institution, group in double_board_df.groupby("Institution"):
        expanded_groups_db.append(group)
        norm_institution = normalize_institution(institution)
        if norm_institution in normalized_nan_id_map:
            for new_affiliation_id in normalized_nan_id_map[norm_institution]:
                if new_affiliation_id in existing_affiliation_ids_double:
                    if int(year) > 2010:
                        expanded_groups_db = [g for g in expanded_groups_db 
                                              if g["AffiliationId"].iloc[0] != new_affiliation_id]
                        print(f"{year}: Overwriting double board group for affiliation id {new_affiliation_id}")
                    else:
                        continue
                new_group = group.copy()
                new_group["AffiliationId"] = new_affiliation_id
                new_inst = inst_mapping.get(new_affiliation_id, institution)
                new_group["Institution"] = new_inst
                expanded_groups_db.append(new_group)
    expanded_double_board_df = pd.concat(expanded_groups_db, ignore_index=True)
    
    expanded_board_df["PrimarySample"] = expanded_board_df["AffiliationId"].map(primary_sample_mapping)
    expanded_double_board_df["PrimarySample"] = expanded_double_board_df["AffiliationId"].map(primary_sample_mapping)
    
    expanded_board_df.sort_values(by=['Institution'], inplace=True)
    expanded_double_board_df.sort_values(by=['Institution'], inplace=True)
    
    expanded_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_boards_regression.csv"), index=False)
    expanded_double_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_double_boards_regression.csv"), index=False)


Abilene Christian University {'Arizona Board Of Regents': [55732556, 203172682, 138006243], 'Indiana University System': [4210119109, 55769427], 'Oklahoma State System Of Higher Education': [115475287, 8692664], 'Tennessee Board Of Regents': [119443389, 169615421, 75256744, 63920570, 94658018]}
Adelphi University {'Arizona Board Of Regents': [55732556, 203172682, 138006243], 'Indiana University System': [4210119109, 55769427], 'Oklahoma State System Of Higher Education': [115475287, 8692664], 'Tennessee Board Of Regents': [119443389, 169615421, 75256744, 63920570, 94658018]}
Agnes Scott College {'Arizona Board Of Regents': [55732556, 203172682, 138006243], 'Indiana University System': [4210119109, 55769427], 'Oklahoma State System Of Higher Education': [115475287, 8692664], 'Tennessee Board Of Regents': [119443389, 169615421, 75256744, 63920570, 94658018]}
Albion College {'Arizona Board Of Regents': [55732556, 203172682, 138006243], 'Indiana University System': [4210119109, 55769427], 

In [None]:
# '''
# This code is essentially supposed to union the state system boards together when they are reported separately (pre 2009)
# However, the boards seem to actually be separate a lot of the times, which i am confused by. Is the book wrong? 
#     Ohio state system reports under one board later on, but it seems like this is not what happens in real life
#     Same thing with florida state? It says they started merging in 2003, but they dont actually have the same board until 2009 df
# '''


# # # Create a set of unique systems (each system is a set of affiliation IDs) from system_id_map.
# unique_systems = set()
# for vals in system_id_map.values():
#     unique_systems.add(frozenset(vals))
# unique_systems = [set(s) for s in unique_systems]  # Convert frozenset back to set for easier use

# # We'll store a report of additions per system
# system_report = {}

# for year in years:
#     board_path = os.path.join(absolute_path, board_dataframes, f"{year}_boards.csv")
#     double_board_path = os.path.join(absolute_path, board_dataframes, f"{year}_double_board.csv")
#     board_df = pd.read_csv(board_path)
#     double_board_df = pd.read_csv(double_board_path)
    
#     # Make copies that we can update
#     updated_board_df = board_df.copy()
#     updated_double_board_df = double_board_df.copy()
    
#     # For reporting additions in this year
#     year_report = {}
    
#     # Collect new rows for board_df in a list (to avoid repeated concat calls)
#     new_rows = []
    
#     # --- Process board_df for system unioning ---
#     for system in unique_systems:
#         # Get all rows in the system (any institution whose AffiliationId is in the system)
#         system_subset = updated_board_df[ updated_board_df["AffiliationId"].isin(system) ]
#         # Compute the union of board member names across the system
#         union_names = set(system_subset["Name"].unique())
        
#         # For each institution (target affiliation) in this system:
#         for inst_id in system:
#             inst_rows = updated_board_df[ updated_board_df["AffiliationId"] == inst_id ]
#             current_names = set(inst_rows["Name"].unique())
#             missing_names = union_names - current_names
            
#             current_count = len(inst_rows)
#             # Only add if the missing names count would not exceed a 20% increase
#             if current_count == 0 or len(missing_names) > 0.2 * current_count:
#                 continue  # Skip unioning for this institution
            
#             added_count = 0
#             # Use a representative row from the target institution to get institution-specific info
#             rep_target = inst_rows.iloc[0].copy()
#             for missing_name in missing_names:
#                 # For board member–specific details, pick a representative row from the union where Name==missing_name
#                 rep_union_rows = system_subset[ system_subset["Name"] == missing_name ]
#                 if rep_union_rows.empty:
#                     continue
#                 rep_union = rep_union_rows.iloc[0].copy()
                
#                 # Create new row: copy all board member–specific info from rep_union,
#                 # but override institution-specific columns with the target institution's info.
#                 new_row = rep_union.copy()
#                 new_row["Name"] = missing_name  # Should already be missing_name
#                 new_row["Institution"] = rep_target["Institution"]
#                 new_row["AffiliationId"] = rep_target["AffiliationId"]
#                 new_row["carnegie_id"] = rep_target["carnegie_id"]
#                 new_row["Added"] = True  # Flag to indicate this row was added by unioning
#                 new_rows.append(new_row)
#                 added_count += 1
#             if added_count > 0:
#                 year_report.setdefault(inst_id, 0)
#                 year_report[inst_id] += added_count
    
#     # Append the new rows (if any) to updated_board_df.
#     if new_rows:
#         new_rows_df = pd.DataFrame(new_rows)
#         # Ensure new_rows_df has the same column order as updated_board_df
#         new_rows_df = new_rows_df[updated_board_df.columns]
#         updated_board_df = pd.concat([updated_board_df, new_rows_df], ignore_index=True)
    
#     # Remove any duplicates within each institution based on ("AffiliationId", "Name")
#     updated_board_df = updated_board_df.drop_duplicates(subset=["AffiliationId", "Name"])
    
#     # --- Process double_board_df similarly ---
#     new_rows_db = []
#     double_year_report = {}
#     for system in unique_systems:
#         system_subset_db = updated_double_board_df[ updated_double_board_df["AffiliationId"].isin(system) ]
#         union_names_db = set(system_subset_db["Name"].unique())
        
#         for inst_id in system:
#             inst_rows_db = updated_double_board_df[ updated_double_board_df["AffiliationId"] == inst_id ]
#             current_names_db = set(inst_rows_db["Name"].unique())
#             missing_names_db = union_names_db - current_names_db
            
#             current_count_db = len(inst_rows_db)
#             if current_count_db == 0 or len(missing_names_db) > 0.2 * current_count_db:
#                 continue
            
#             added_count = 0
#             rep_target_db = inst_rows_db.iloc[0].copy()
#             for missing_name in missing_names_db:
#                 rep_union_rows_db = system_subset_db[ system_subset_db["Name"] == missing_name ]
#                 if rep_union_rows_db.empty:
#                     continue
#                 rep_union_db = rep_union_rows_db.iloc[0].copy()
#                 new_row_db = rep_union_db.copy()
#                 new_row_db["Name"] = missing_name
#                 new_row_db["Institution"] = rep_target_db["Institution"]
#                 new_row_db["AffiliationId"] = rep_target_db["AffiliationId"]
#                 new_row_db["carnegie_id"] = rep_target_db["carnegie_id"]
#                 new_row_db["Added"] = True
#                 new_rows_db.append(new_row_db)
#                 added_count += 1
#             if added_count > 0:
#                 double_year_report.setdefault(inst_id, 0)
#                 double_year_report[inst_id] += added_count
                
#     if new_rows_db:
#         new_rows_db_df = pd.DataFrame(new_rows_db)
#         new_rows_db_df = new_rows_db_df[updated_double_board_df.columns]
#         updated_double_board_df = pd.concat([updated_double_board_df, new_rows_db_df], ignore_index=True)
#     updated_double_board_df = updated_double_board_df.drop_duplicates(subset=["AffiliationId", "Name"])
    
#     # Print report for this year
#     print(f"Year {year} board expansion report:")
#     for inst_id, count in year_report.items():
#         print(f" - Institution with AffiliationId {inst_id}: added {count} board members.")
#     print(f"Year {year} double board expansion report:")
#     for inst_id, count in double_year_report.items():
#         print(f" - Institution with AffiliationId {inst_id}: added {count} board members.")
    
#     # Sort the DataFrames by Institution and AffiliationId before saving
#     updated_board_df.sort_values(by=['Institution', 'AffiliationId'], inplace=True)
#     updated_double_board_df.sort_values(by=['Institution', 'AffiliationId'], inplace=True)
    
#     # Write the updated DataFrames to CSV
#     updated_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_boards_regression.csv"), index=False)
#     updated_double_board_df.to_csv(os.path.join(absolute_path, f"{final_scripts}{regression}regression_boards", f"{year}_double_boards_regression.csv"), index=False)
    
#     # Store the report for this year
#     system_report[year] = {"boards": year_report, "double_boards": double_year_report}

# print("Overall system expansion report:", system_report)
