In [18]:
import pandas as pd
import os

In [19]:
# Root of the project's data directory. Defaults to the original workstation
# path; set the CONNECTEDDATAHUB_DATA environment variable to run the notebook
# on another machine without editing this cell.
data_path = os.environ.get('CONNECTEDDATAHUB_DATA', r"C:\Projects\connecteddatahub\data")
maps_dir = os.path.join(data_path, 'maps')

# NOTE: the frame names do not mirror the file names:
#   state_system_df   <- system_mapping.csv       (later cells read 'StateSystem' and 'AffiliationId')
#   affiliation_df    <- affiliation.csv          (later cells read 'FullName', 'AffiliationId', 'PrimarySample')
#   system_mapping_df <- affiliation_systems.csv  (later cells read 'AffiliationId' and 'StateSystem')
state_system_df = pd.read_csv(os.path.join(maps_dir, 'system_mapping.csv'))
affiliation_df = pd.read_csv(os.path.join(maps_dir, 'affiliation.csv'))
system_mapping_df = pd.read_csv(os.path.join(maps_dir, 'affiliation_systems.csv'))


In [20]:
'''
Only some of the state systems come with an AffiliationId.
Every system lacking one receives an arbitrary, easy-to-spot 4-digit id (1000, 1001, ...).
The systems without an id had to be matched by institution instead; all of them matched.
'''
nan_ids = state_system_df['AffiliationId'].isna()
num_missing = nan_ids.sum()
placeholder_ids = range(1000, 1000 + num_missing)

state_system_df.loc[nan_ids, 'AffiliationId'] = placeholder_ids
# Show which systems just received a placeholder id.
print(state_system_df.loc[nan_ids, 'StateSystem'])

# Expose the system name under the column affiliation_df is joined on.
state_system_df['FullName'] = state_system_df['StateSystem']
state_id_lookup = state_system_df[['FullName', 'AffiliationId']]

# Left join keeps every affiliation row; the state-side id arrives in a
# temporary 'AffiliationId_from_state' column.
affiliation_df = affiliation_df.merge(
    state_id_lookup,
    how='left',
    on='FullName',
    suffixes=('', '_from_state'),
)

# Existing ids win; the state-side id only fills the gaps. pop() removes the
# temporary column from the frame while handing back its values.
state_side_ids = affiliation_df.pop('AffiliationId_from_state')
affiliation_df['AffiliationId'] = affiliation_df['AffiliationId'].fillna(state_side_ids)

0                      Arizona Board of Regents
5                     Indiana University System
9     Oklahoma State System of Higher Education
16                   Tennessee Board of Regents
Name: StateSystem, dtype: object


In [21]:
# Flag the state-system rows as primary sample so the later filter keeps them,
# and label each of those rows with its own system name.
state_system_ids = state_system_df['AffiliationId']
primary_mask = affiliation_df['AffiliationId'].isin(state_system_ids)
affiliation_df.loc[primary_mask, 'PrimarySample'] = True

# Series lookup: AffiliationId -> StateSystem name.
state_system_mapping = state_system_df.set_index('AffiliationId')['StateSystem']
primary_ids = affiliation_df.loc[primary_mask, 'AffiliationId']
affiliation_df.loc[primary_mask, 'StateSystem'] = primary_ids.map(state_system_mapping)

In [22]:
# Build the lookup {state system name -> AffiliationId} from the rows that
# currently carry a StateSystem label.
labelled_rows = affiliation_df.dropna(subset=['StateSystem'])
ids_by_system = labelled_rows.set_index('StateSystem')['AffiliationId']
system_affiliation_dict = ids_by_system.to_dict()
print(system_affiliation_dict)

{'Arizona Board of Regents': 1000.0, 'California State University System': 127339247.0, 'City University of New York': 174216632.0, 'Colorado State University System': 4210131357.0, 'Idaho State Board of Education': 4210165361.0, 'Indiana University System': 1001.0, 'Kansas Board of Regents': 2801365651.0, 'Mississippi Institutions of Higher Learning': 4210141039.0, 'North Dakota University System': 4210127926.0, 'Oklahoma State System of Higher Education': 1002.0, 'Pennsylvania State System of Higher Education': 29957033.0, 'State University of New York System': 1327163397.0, 'State University System of Florida': 2801649442.0, 'Texas A&M University System': 173268674.0, 'Texas State University System': 2801273398.0, 'Texas Tech University System': 4210088475.0, 'Tennessee Board of Regents': 1003.0, 'Nevada System of Higher Education': 68260882.0, 'University of Alabama System': 2800507078.0, 'University of Alaska System': 2802090665.0, 'University of Arkansas System': 2799691083.0, 'U

In [23]:
# Label every institution that is part of a state system with that system's name.
# Mapping rows without an AffiliationId cannot be matched, so discard them first
# (reassignment instead of inplace=True keeps the cell free of hidden mutation).
system_mapping_df = system_mapping_df.dropna(subset=['AffiliationId'])
system_mapping_dict = system_mapping_df.set_index('AffiliationId')['StateSystem'].to_dict()

# Membership lookup: AffiliationId -> name of the system the institution belongs to.
member_systems = affiliation_df['AffiliationId'].map(system_mapping_dict)

# Do not wipe labels that are already present (the state-system rows were
# labelled with their own name earlier). Assigning the raw map result would
# reset those rows to NaN, leaving them dependent on FullName happening to
# equal the system name when SystemId is derived later.
if 'StateSystem' in affiliation_df.columns:
    member_systems = member_systems.fillna(affiliation_df['StateSystem'])

affiliation_df['StateSystem'] = member_systems

In [24]:
# Translate each row's state-system association into that system's AffiliationId.
# Two lookups against the same name -> id dict:
#   1. via the StateSystem label,
#   2. via the row's own FullName (covers rows that ARE a state system).
# The label-based result takes precedence; the name-based one only fills gaps.
system_id_from_label = affiliation_df['StateSystem'].map(system_affiliation_dict)
system_id_from_name = affiliation_df['FullName'].map(system_affiliation_dict)
affiliation_df['SystemId'] = system_id_from_label.fillna(system_id_from_name)

# StateSystem is not used for matching from here on, so remove it.
del affiliation_df['StateSystem']


In [25]:
# 1) Keep rows that are flagged as primary sample OR that carry a SystemId.
#    SystemId holds only ids or NaN (it is built from the name -> id lookup),
#    so notna() is the complete "non-empty" test; comparing against '' is
#    always true for a numeric column and has been removed.
is_primary = affiliation_df['PrimarySample'].eq(True)
has_system_id = affiliation_df['SystemId'].notna()
mask = is_primary | has_system_id

# 2) Filter and work on a copy so affiliation_df itself is left untouched
filtered = affiliation_df.loc[mask].copy()

# 3) Within each AffiliationId group, propagate any existing SystemId to all rows
#    (forward fill first, then backward fill for leading gaps).
propagated = (
    filtered
      .groupby('AffiliationId')['SystemId']
      .transform(lambda ids: ids.ffill().bfill())
)
#    groupby excludes rows whose AffiliationId is NaN, so transform yields NaN
#    for them; fall back to the row's original SystemId instead of erasing it.
filtered['SystemId'] = propagated.fillna(filtered['SystemId'])

# 4) Finally, write out
filtered.to_csv(
    os.path.join(data_path, 'maps', 'cleaned_affiliation.csv'),
    index=False
)
