In [47]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [48]:
# Load the dataset
file_path = 'Most-Recent-Cohorts-Scorecard-Elements.csv'
dataset = pd.read_csv(file_path)

In [49]:
# Display basic information about the dataset
print("Initial dataset shape:", dataset.shape)

Initial dataset shape: (7703, 123)


In [50]:
dataset.head()

Unnamed: 0,index,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,INSTURL,NPCURL,HCM2,...,RET_PTL4,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GT_25K_P6,GRAD_DEBT_MDN_SUPP,GRAD_DEBT_MDN10YR_SUPP,RPY_3YR_RT_SUPP,C150_L4_POOLED_SUPP,C150_4_POOLED_SUPP
0,0,100654,100200,1002,Alabama A & M University,Normal,AL,www.aamu.edu/,galileo.aamu.edu/netpricecalculator/npcalc.htm,0,...,,0.8284,0.1049,30300,0.426,33888.0,347.789507913825,0.2370456303,,0.32451962616822
1,1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,www.uab.edu,www.collegeportraits.org/AL/UAB/estimator/agree,0,...,,0.5214,0.2422,39700,0.665,21941.5,225.183648722001,0.5006735519,,0.54610480182926
2,2,100690,2503400,25034,Amridge University,Montgomery,AL,www.amridgeuniversity.edu,www2.amridgeuniversity.edu:9091/,0,...,,0.7795,0.854,40100,0.676,23370.0,239.844216240146,0.2904884319,,PrivacySuppressed
3,3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,www.uah.edu,finaid.uah.edu/,0,...,,0.4596,0.264,45500,0.668,24097.0,247.305352106924,0.5278022948,,0.47240649606299
4,4,100724,100500,1005,Alabama State University,Montgomery,AL,www.alasu.edu,www.alasu.edu/cost-aid/forms/calculator/index....,0,...,,0.7554,0.127,26600,0.36,33118.5,339.892198354698,0.1855174624,,0.2574049608355


In [51]:
# Handle duplicates
dataset = dataset.drop_duplicates()
print("Dataset shape after removing duplicates:", dataset.shape)

Dataset shape after removing duplicates: (7703, 123)


In [52]:
# Replace 'PrivacySuppressed' with NaN in 'MD_EARN_WNE_P10' and 'GRAD_DEBT_MDN_SUPP'
if 'MD_EARN_WNE_P10' in dataset.columns:
    dataset['MD_EARN_WNE_P10'] = dataset['MD_EARN_WNE_P10'].replace('PrivacySuppressed', np.nan).astype(float)

if 'GRAD_DEBT_MDN_SUPP' in dataset.columns:
    dataset['GRAD_DEBT_MDN_SUPP'] = dataset['GRAD_DEBT_MDN_SUPP'].replace('PrivacySuppressed', np.nan).astype(float)

if 'RPY_3YR_RT_SUPP' in dataset.columns:
    dataset['RPY_3YR_RT_SUPP'] = dataset['RPY_3YR_RT_SUPP'].replace('PrivacySuppressed', np.nan).astype(float)

In [53]:
# Impute missing values
# For numerical columns
num_cols = ['PCTFLOAN', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
num_cols = [col for col in num_cols if col in dataset.columns]
if num_cols:
    imputer = SimpleImputer(strategy='median')
    dataset[num_cols] = imputer.fit_transform(dataset[num_cols])

In [54]:
# For categorical columns
cat_cols = ['INSTNM', 'CITY']
cat_cols = [col for col in cat_cols if col in dataset.columns]
if cat_cols:
    imputer = SimpleImputer(strategy='most_frequent')
    dataset[cat_cols] = imputer.fit_transform(dataset[cat_cols])

In [55]:
# Normalize numerical features
if num_cols:
    scaler = StandardScaler()
    dataset[num_cols] = scaler.fit_transform(dataset[num_cols])

In [56]:
# One-hot encode categorical variables
cat_cols = ['INSTNM', 'CITY', 'STABBR''INSTURL']
cat_cols = [col for col in cat_cols if col in dataset.columns]
if cat_cols:
    dataset = pd.get_dummies(dataset, columns=cat_cols, drop_first=True)


In [57]:
# Create derived features
if 'GRAD_DEBT_MDN_SUPP' in dataset.columns and 'MD_EARN_WNE_P10' in dataset.columns:
    dataset['debt_to_income_ratio'] = dataset['GRAD_DEBT_MDN_SUPP'] / dataset['MD_EARN_WNE_P10']


In [58]:
# Transform target variable
if 'RPY_3YR_RT_SUPP' in dataset.columns:
    threshold = dataset['RPY_3YR_RT_SUPP'].median()
    dataset['repayment_rate_high'] = (dataset['RPY_3YR_RT_SUPP'] >= threshold).astype(int)

In [59]:
# Save the preprocessed dataset
dataset.to_csv('preprocessed_scorecard_data.csv', index=False)

In [60]:
# Display the preprocessed dataset
dataset.head()

Unnamed: 0,index,UNITID,OPEID,OPEID6,STABBR,INSTURL,NPCURL,HCM2,PREDDEG,CONTROL,...,CITY_Youngwood,CITY_Ypsilanti,CITY_Yuba City,CITY_Yucaipa,CITY_Yucca Valley,CITY_Yukon,CITY_Yuma,CITY_Zanesville,debt_to_income_ratio,repayment_rate_high
0,0,100654,100200,1002,AL,www.aamu.edu/,galileo.aamu.edu/netpricecalculator/npcalc.htm,0,3,1,...,False,False,False,False,False,False,False,False,-14.876943,0
1,1,100663,105200,1052,AL,www.uab.edu,www.collegeportraits.org/AL/UAB/estimator/agree,0,3,1,...,False,False,False,False,False,False,False,False,1.25088,1
2,2,100690,2503400,25034,AL,www.amridgeuniversity.edu,www2.amridgeuniversity.edu:9091/,0,3,2,...,False,False,False,False,False,False,False,False,1.490541,0
3,3,100706,105500,1055,AL,www.uah.edu,finaid.uah.edu/,0,3,1,...,False,False,False,False,False,False,False,False,0.973952,1
4,4,100724,100500,1005,AL,www.alasu.edu,www.alasu.edu/cost-aid/forms/calculator/index....,0,3,1,...,False,False,False,False,False,False,False,False,-4.929777,0


In [61]:
dataset.shape

(7703, 10198)