In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import os

In [75]:
# File Paths
# Project root on the author's Windows machine. Every backslash is escaped
# explicitly: the earlier literal contained "\\\O" and "\V", which are invalid
# escape sequences (Python 3.12+ emits a SyntaxWarning for them) and left a
# doubled separator in front of "OneDrive".
absolute_path = "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\board_analysis\\"

# Sub-directory names. Each one keeps its trailing separator so it can be
# appended directly to absolute_path with plain string formatting.
altered_dataframes = "altered_dataframes\\"
gpt_dataframes = "gpt_dataframes\\"
graphs = "graphs\\"
scripts = "scripts\\"
board_dataframes = "board_dataframes\\"
temporary_data = "temporary_data\\"
yearly_interlocks = "yearly_interlocks\\"
final_scripts = "final_scripts\\"
normalized_dataframes = "normalized_dataframes\\"
college_matching = "college_matching\\"

# Fully qualified locations built from the root and the sub-directory names.
altered_dataframe_path = f"{absolute_path}{altered_dataframes}"
gpt_dataframe_path = f"{absolute_path}{gpt_dataframes}"
graph_path = f"{absolute_path}{graphs}"
script_path = f"{absolute_path}{scripts}"
boards_path = f"{absolute_path}{board_dataframes}"
state_path = f"{absolute_path}{temporary_data}state_systems_validated.csv"
college_matching_path = f"{absolute_path}{college_matching}"

# Valid Years (kept as strings)
years = ["1999", "2000", "2005", "2007", "2008", "2009", "2011", "2013", "2018"]

In [76]:
# Board statistics sample: remember its location in file_path, because later
# cells write the enriched frame back to the same file.
file_path = os.path.join(
    absolute_path,
    final_scripts,
    normalized_dataframes,
    "sample_board_statistics_normalized.csv",
)
board_statistics = pd.read_csv(file_path)

# Lookup tables stored in the college-matching directory.
# carnegie_map is used downstream to go from AffiliationId to carnegie_id;
# classification_map supplies unitid, state and control.
carnegie_map = pd.read_csv(college_matching_path + "carnegie_map_openalex.csv")
classification_map = pd.read_csv(college_matching_path + "cc_download.csv")


In [77]:
# -- Attach Carnegie id, state and control to board_statistics --

# 1. Drop 'carnegie_id', 'state' and 'control' if they are already present, so
#    the merges below do not collide with columns left over from an earlier run.
board_statistics = board_statistics.drop(
    columns=['carnegie_id', 'state', 'control'], errors='ignore'
)

# 2. Bring in 'carnegie_id' from carnegie_map, matching on 'AffiliationId'.
#    NOTE(review): a left merge repeats a board row for every matching key in
#    the lookup table; if carnegie_map can hold duplicate AffiliationIds,
#    consider passing validate='m:1' to surface that.
board_statistics = board_statistics.merge(
    carnegie_map[['AffiliationId', 'carnegie_id']],
    on='AffiliationId',
    how='left'
)

# 3. Bring in 'state' and 'control' from classification_map, matching
#    'carnegie_id' against the map's 'unitid'.
board_statistics = board_statistics.merge(
    classification_map[['unitid', 'state', 'control']],
    left_on='carnegie_id',
    right_on='unitid',
    how='left',
    suffixes=('', '_classification')
)

# 4. 'unitid' was only needed as the join key.
board_statistics = board_statistics.drop(columns='unitid')

# 5. Rename columns if the merge had to add the '_classification' suffix.
board_statistics = board_statistics.rename(
    columns={'state_classification': 'state', 'control_classification': 'control'}
)

# 6. Remove any columns that start with "Unnamed". Their names are captured
#    BEFORE the removal: checking afterwards (as this cell used to) can never
#    find anything, so the report below always claimed none were detected.
unnamed_mask = board_statistics.columns.str.contains('^Unnamed')
unnamed_columns = list(board_statistics.columns[unnamed_mask])
board_statistics = board_statistics.loc[:, ~unnamed_mask]

# Final DataFrame Verification
print("\nMerged board_statistics DataFrame:")
print(board_statistics)

# Report which unnamed columns (if any) were removed in step 6.
if unnamed_columns:
    print("\nUnnamed Columns Detected and Removed:")
    print(unnamed_columns)
else:
    print("\nNo Unnamed Columns Detected.")


# Collapse the 'control' labels to 'Private' / 'Public' by substring
# ('Private' is tested first). Any other value, including a missing one,
# is left untouched.
board_statistics['control'] = board_statistics['control'].apply(
    lambda x: 'Private' if 'Private' in str(x) else 'Public' if 'Public' in str(x) else x
)



Merged board_statistics DataFrame:
      Year                      Institution  AffiliationId  female_president  \
0     1999               Adelphi University     71965598.0             False   
1     1999              American University    181401687.0             False   
2     1999               Andrews University    102298084.0             False   
3     1999         Arizona Board of Regents     55732556.0             False   
4     1999                Auburn University     82497590.0             False   
...    ...                              ...            ...               ...   
1381  2018      Western Michigan University    141649380.0             False   
1382  2018               Widener University    138659443.0              True   
1383  2018  Worcester Polytechnic Institute    107077323.0              True   
1384  2018                  Yale University     32971472.0             False   
1385  2018               Yeshiva University     19772626.0             False   

   

In [78]:
# Two-letter state code -> region label.
# Written as one row per region so each region's membership can be read at a
# glance; the comprehension expands the rows into a flat lookup dict.
state_to_region = {
    state_code: region_name
    for region_name, state_codes in (
        ("Northeast", "CT ME MA NH RI VT NJ NY PA DC"),
        ("Midwest", "IL IN IA KS MI MN MO NE ND OH SD WI"),
        ("South", "AL AR DE FL GA KY LA MD MS NC OK SC TN TX VA WV"),
        ("West", "AK AZ CA CO HI ID MT NV NM OR UT WA WY"),
    )
    for state_code in state_codes.split()
}
# Look up each row's region from its state code (codes missing from
# state_to_region produce NaN), then write the frame back to the CSV it was
# loaded from.
board_statistics = board_statistics.assign(
    region=board_statistics['state'].map(state_to_region)
)
board_statistics.to_csv(file_path, index=False)


In [79]:
# Full U.S. state names -> two-letter postal abbreviations.
# Covers the 50 states in alphabetical order, grouped by initial letter.
state_to_abbreviation = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND",
    "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA",
    "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}

def update_region_from_institution(row):
    """
    Resolve the region for a single board_statistics row.

    A region that is already populated is kept unchanged (in this notebook it
    is first derived from the 'state' column). Only when the region is
    missing -- None, NaN, or the "Unknown" placeholder -- is the 'Institution'
    name scanned, case-insensitively, for a full state name; a hit is
    translated through state_to_abbreviation and then state_to_region.

    The name scan used to run unconditionally, so an institution named after
    a state it is not located in (for example a "Washington ..." university
    outside Washington state) had its state-derived region overwritten with
    the wrong one. State names are also tried longest-first now, so
    "West Virginia" is preferred over the "Virginia" it contains and the
    outcome no longer depends on the ordering of state_to_abbreviation.

    Parameters
    ----------
    row : mapping-like
        One record, e.g. the Series passed by DataFrame.apply(axis=1).
        'Institution' is read when the region is missing; 'region' is
        optional.

    Returns
    -------
    The existing region when populated; otherwise the region inferred from
    the institution name; otherwise the existing 'region' value as-is (which
    may be NaN), or "Unknown" when the row has no 'region' entry at all.
    """
    existing_region = row.get('region', "Unknown")

    # pd.isna is evaluated first so NaN/None/pd.NA never reach the string
    # comparison.
    region_is_missing = pd.isna(existing_region) or existing_region == "Unknown"
    if not region_is_missing:
        return existing_region

    institution = str(row['Institution']).lower()

    # Longest names first: a state name embedded in a longer one
    # ("Virginia" in "West Virginia", "Kansas" in "Arkansas") must not win.
    for full_state in sorted(state_to_abbreviation, key=len, reverse=True):
        if full_state.lower() in institution:
            abbrev = state_to_abbreviation[full_state]
            return state_to_region.get(abbrev, "Unknown")

    # No state name in the institution: leave the value as it was.
    return existing_region

# Recompute 'region' row by row with update_region_from_institution.
board_statistics = board_statistics.assign(
    region=lambda frame: frame.apply(update_region_from_institution, axis=1)
)

# Quick look at the first few institution/region pairs.
print(board_statistics.loc[:, ['Institution', 'region']].head())

# Drop fully identical rows, then persist the result to file_path.
board_statistics = board_statistics.drop_duplicates()
board_statistics.to_csv(file_path, index=False)


                Institution     region
0        Adelphi University  Northeast
1       American University  Northeast
2        Andrews University    Midwest
3  Arizona Board of Regents       West
4         Auburn University      South
