In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import os

In [71]:
# File Paths
# Root folder of the board_analysis project. Every backslash is escaped
# explicitly: the previous literal contained "\O" and "\V", which are invalid
# escape sequences (SyntaxWarning on Python 3.12+), and a stray extra
# backslash before "OneDrive" that put a doubled separator into the path.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to edit it (or it should come from a config/env variable).
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-folder names, each with a trailing separator so a file name can be
# appended directly to the composed path.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary_data = "temporary_data\\"
college_matching = "college_matching\\"

# Full folder paths (root + sub-folder).
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
# Unlike the entries above, this one points at a single CSV file, not a folder.
state_path = f"{absolute_path}{temporary_data}state_systems_validated.csv"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years
# Kept as strings, exactly as in the original list.
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [72]:
# Load the three input tables used by the merge cell below.

# Lookup read from the college_matching folder; the merge cell uses its
# 'AffiliationId' and 'carnegie_id' columns.
carnegie_map = pd.read_csv(college_matching_path + "carnegie_map_openalex.csv")

# Lookup read from the college_matching folder; the merge cell uses its
# 'unitid', 'state' and 'control' columns.
classification_map = pd.read_csv(college_matching_path + "cc_download.csv")

# Board statistics table that the following cells enrich and write back.
board_statistics = pd.read_csv(altered_dataframe_path + "sample_board_statistics.csv")


In [73]:
# -- Attach carnegie_id, state and control to board_statistics --
#
# This cell is written to be re-runnable: columns it creates are dropped first,
# so running it against a frame that already contains them does not produce
# suffixed duplicates.


def _normalize_control(value):
    """Collapse a control label to 'Private' or 'Public'.

    Any value whose string form contains 'Private' becomes 'Private'; otherwise
    any value containing 'Public' becomes 'Public'. Everything else (including
    missing values) is returned unchanged.
    """
    text = str(value)
    if 'Private' in text:
        return 'Private'
    if 'Public' in text:
        return 'Public'
    return value


# Remember the row count so that row duplication caused by the left merges
# below can be detected and reported.
rows_before_merge = len(board_statistics)

# 1. Remove any existing 'carnegie_id', 'state', 'control' columns
#    (errors='ignore' makes this a no-op when they are absent).
board_statistics = board_statistics.drop(
    columns=['carnegie_id', 'state', 'control'], errors='ignore'
)

# 2. Merge 'carnegie_id' from carnegie_map based on 'AffiliationId'
board_statistics = board_statistics.merge(
    carnegie_map[['AffiliationId', 'carnegie_id']],
    on='AffiliationId',
    how='left'
)

# 3. Merge 'state' and 'control' from classification_map based on 'carnegie_id'.
#    Because 'state' and 'control' were dropped in step 1, the suffixes are only
#    a safety net; no rename of '*_classification' columns is needed afterwards.
board_statistics = board_statistics.merge(
    classification_map[['unitid', 'state', 'control']],
    left_on='carnegie_id',
    right_on='unitid',
    how='left',
    suffixes=('', '_classification')
)

# 4. Drop the redundant 'unitid' join key brought in by step 3
board_statistics = board_statistics.drop(columns='unitid')

# A left merge repeats a left row once per matching right row, so duplicate
# keys in either lookup table silently inflate the data. Report it rather than
# letting it pass unnoticed (kept non-fatal so the notebook still runs).
if len(board_statistics) != rows_before_merge:
    print(
        f"\nWarning: row count changed from {rows_before_merge} to "
        f"{len(board_statistics)} during the merges; check carnegie_map and "
        "classification_map for duplicate keys."
    )

# 5. Remove any columns that start with "Unnamed".
#    The list is captured *before* dropping so the report below is accurate.
unnamed_columns = [
    col for col in board_statistics.columns if str(col).startswith('Unnamed')
]
board_statistics = board_statistics.drop(columns=unnamed_columns)

# 6. Normalise 'control' to 'Private' / 'Public' before the verification
#    print, so that the frame shown below is the final one.
board_statistics['control'] = board_statistics['control'].apply(_normalize_control)

# Final DataFrame Verification
print("\nMerged board_statistics DataFrame:")
print(board_statistics)

# Report which unnamed columns (if any) were removed in step 5
if unnamed_columns:
    print("\nUnnamed Columns Detected and Removed:")
    print(unnamed_columns)
else:
    print("\nNo Unnamed Columns Detected.")



Merged board_statistics DataFrame:
      Year                      Institution  AffiliationId  female_president  \
0     1999     Abilene Christian University     60205797.0             False   
1     1999               Adelphi University     71965598.0             False   
2     1999              Agnes Scott College     64506506.0              True   
3     1999                   Albion College     45644089.0             False   
4     1999                Alfred University     49502546.0             False   
...    ...                              ...            ...               ...   
3975  2018  Worcester Polytechnic Institute    107077323.0              True   
3976  2018                Xavier University    194120229.0             False   
3977  2018   Xavier University Of Louisiana    169251466.0             False   
3978  2018                  Yale University     32971472.0             False   
3979  2018               Yeshiva University     19772626.0             False   

   

In [74]:
# Two-letter state codes grouped by region. The groups are expanded into the
# flat state -> region lookup below.
# NOTE(review): "DC" is grouped with the Northeast here, while DE and MD are in
# the South; the U.S. Census Bureau regions place DC in the South as well —
# confirm this grouping is intentional.
states_by_region = {
    "Northeast": [
        "CT", "ME", "MA", "NH",
        "RI", "VT", "NJ", "NY",
        "PA", "DC",
    ],
    "Midwest": [
        "IL", "IN", "IA", "KS",
        "MI", "MN", "MO", "NE",
        "ND", "OH", "SD", "WI",
    ],
    "South": [
        "AL", "AR", "DE", "FL",
        "GA", "KY", "LA", "MD",
        "MS", "NC", "OK", "SC",
        "TN", "TX", "VA", "WV",
    ],
    "West": [
        "AK", "AZ", "CA", "CO",
        "HI", "ID", "MT", "NV",
        "NM", "OR", "UT", "WA",
        "WY",
    ],
}

# Flat lookup: state code -> region name.
state_to_region = {
    state_code: region_name
    for region_name, state_codes in states_by_region.items()
    for state_code in state_codes
}

# Derive 'region' from 'state'; values that are not keys of the lookup
# (including missing states) map to NaN.
region_series = board_statistics['state'].map(state_to_region)
board_statistics['region'] = region_series

# Write the enriched table to sample_board_statistics.csv in the
# altered_dataframes folder, without the index column.
board_statistics.to_csv(f"{altered_dataframe_path}sample_board_statistics.csv", index=False)
