In [1]:
from pathlib import Path
from typing import Tuple, List

import pandas as pd
import numpy as np

# Load Data

In [2]:
HERE = Path.cwd()

PROJECT_ROOT = HERE.parents[1]

DATA_RAW = PROJECT_ROOT / "data_files"
DATA_PROC = HERE / "processed_data"

In [3]:
def academic_to_calendar_year(val):
    if pd.isna(val):
        return pd.NA
    s = str(val).strip()
    if "-" not in s:
        try:
            return int(float(s))
        except ValueError:
            return pd.NA

    first, second = s.split("-", 1)
    try:
        first_int = int(first)
    except ValueError:
        return pd.NA

    sec_digits = "".join(ch for ch in second if ch.isdigit())
    if not sec_digits:
        return pd.NA
    if len(sec_digits) == 2:
        last2 = int(sec_digits)
        century = (first_int // 100) * 100
        return century + last2
    else:
        return int(sec_digits)

In [4]:
def normalize_cc(name: str):
    if pd.isna(name):
        return name

    s = str(name).upper().strip()

    s = s.replace(".", "")

    for token in [
        " COMMUNITY COLLEGE",
        " COMM COLLEGE",
        " CMTY",          
        " JR COLLEGE",
        " JUNIOR COLLEGE",
        " COLLEGE",
        "COLLEGE ",
        " CCD",
        " C.C.",
        " DISTRICT",
        " TOTAL",
    ]:
        s = s.replace(token, "")

    s = " ".join(s.split())

    cc_aliases = {
        "CHABOT HAYWARD": "CHABOT",

        "PASADENA": "PASADENA CITY",
        "RIVERSIDE": "RIVERSIDE CITY",
        "SAN BERNARDINO": "SAN BERNARDINO VALLEY",
        "SANTA BARBARA": "SANTA BARBARA CITY",
        "SANTA ROSA": "SANTA ROSA JUNIOR",

        "COALINGA": "WEST HILLS COALINGA",
        "LEMOORE": "WEST HILLS LEMOORE",
        "WEST LA": "WEST LOS ANGELES",

        "NAPA": "NAPA VALLEY",
        "LAS POSITAS": "LAS POSITAS", 
        "DEANZA": "DE ANZA",

        "OF THE SISKIYOUS": "SISKIYOUS",
        "OF THE SEQUOIAS": "SEQUOIAS",
        "OF THE REDWOODS": "REDWOODS",
        "OF THE DESERT": "DESERT",
        "OF THE CANYONS": "CANYONS",
        "OF SAN MATEO": "SAN MATEO",
        "OF MARIN": "MARIN",
        "OF ALAMEDA": "ALAMEDA",

        "SAN MATEO": "SAN MATEO",
        "MARIN": "MARIN",
        "ALAMEDA": "ALAMEDA",
        "SAN FRANCISCO": "CITY OF SAN FRANCISCO",
        "SAN FRANCISCO CTRS": "CITY OF SAN FRANCISCO",
        "CITY OF SAN FRANCISCO": "CITY OF SAN FRANCISCO",

        "MT SAN JACINTO": "MOUNT SAN JACINTO",
        "MT SAN ANTONIO": "MOUNT SAN ANTONIO",
       
        "EAST LA": "EAST LOS ANGELES",
        "IMPERIAL": "IMPERIAL VALLEY",
        "IRVINE": "IRVINE VALLEY",
        "LONG BEACH": "LONG BEACH CITY",

        "LA CITY": "LOS ANGELES CITY",
        "LA HARBOR": "LOS ANGELES HARBOR",
        "LA MISSION": "LOS ANGELES MISSION",
        "LA PIERCE": "LOS ANGELES PIERCE",
        "LA SWEST": "LOS ANGELES SOUTHWEST",
        "LA TRADE": "LOS ANGELES TRADE TECHNICAL",
        "LA VALLEY": "LOS ANGELES VALLEY",

        "MODESTO": "MODESTO JUNIOR",
        "MONTEREY": "MONTEREY PENINSULA",
    }

    s = cc_aliases.get(s, s)
    return s

In [5]:
def normalize_uc(name: str):
    if pd.isna(name):
        return name
    s = str(name).strip()

    mapping = {
        # Berkeley
        "University of California-Berkeley": "Berkeley",
        "UC Berkeley": "Berkeley",
        "UCB": "Berkeley",
        "Berkeley": "Berkeley",

        # Davis
        "University of California-Davis": "Davis",
        "UC Davis": "Davis",
        "UCD": "Davis",
        "Davis": "Davis",

        # Irvine
        "University of California-Irvine": "Irvine",
        "UC Irvine": "Irvine",
        "UCI": "Irvine",
        "Irvine": "Irvine",

        # Los Angeles
        "University of California-Los Angeles": "Los Angeles",
        "University of California, Los Angeles": "Los Angeles",
        "UC Los Angeles": "Los Angeles",
        "UCLA": "Los Angeles",
        "Los Angeles": "Los Angeles",

        # Merced
        "University of California-Merced": "Merced",
        "UC Merced": "Merced",
        "UCM": "Merced",
        "Merced": "Merced",

        # Riverside
        "University of California-Riverside": "Riverside",
        "UC Riverside": "Riverside",
        "UCR": "Riverside",
        "Riverside": "Riverside",

        # San Diego
        "University of California-San Diego": "San Diego",
        "UC San Diego": "San Diego",
        "UCSD": "San Diego",
        "San Diego": "San Diego",

        # Santa Barbara
        "University of California-Santa Barbara": "Santa Barbara",
        "UC Santa Barbara": "Santa Barbara",
        "UCSB": "Santa Barbara",
        "Santa Barbara": "Santa Barbara",

        # Santa Cruz
        "University of California-Santa Cruz": "Santa Cruz",
        "UC Santa Cruz": "Santa Cruz",
        "UCSC": "Santa Cruz",
        "Santa Cruz": "Santa Cruz",
    }

    return mapping.get(s, s)

# Data processing

## Scorecard features: CC + UC

In [6]:
cc_score_cols = [
    "year",
    "school.name",
    "aid.ftft_pell_grant_rate",
    "aid.ftft_federal_loan_rate",
    "aid.pell_grant_rate",
    "aid.federal_loan_rate",
    "student.enrollment.undergrad_12_month",
    "cost.attendance.academic_year",
]

uc_score_cols = cc_score_cols + ["admissions.admission_rate.overall"]

# CC scorecard
cc_raw = pd.read_csv(DATA_RAW / "cc_scorecard.csv", usecols=cc_score_cols)

cc_feat = (
    cc_raw.rename(
        columns={
            "school.name": "cc_name_raw",
            "aid.ftft_pell_grant_rate": "cc_ftft_pell_rate",
            "aid.ftft_federal_loan_rate": "cc_ftft_fedloan_rate",
            "aid.pell_grant_rate": "cc_pell_rate",
            "aid.federal_loan_rate": "cc_fedloan_rate",
            "student.enrollment.undergrad_12_month": "cc_ug_enroll_12m",
            "cost.attendance.academic_year": "cc_coa_ay",
        }
    )
    .assign(
        cc_name=lambda d: d["cc_name_raw"].apply(normalize_cc),
        year=lambda d: d["year"].apply(academic_to_calendar_year).astype("Int64"),
    )
    .drop(columns=["cc_name_raw"])
    .sort_values(["cc_name", "year"])
)

cc_feat.to_csv(DATA_PROC / "cc_scorecard_features.csv", index=False)

# UC scorecard
uc_raw = pd.read_csv(DATA_RAW / "uc_scorecard.csv", usecols=uc_score_cols)

uc_feat = (
    uc_raw.rename(
        columns={
            "school.name": "uc_name_full",
            "aid.ftft_pell_grant_rate": "uc_ftft_pell_rate",
            "aid.ftft_federal_loan_rate": "uc_ftft_fedloan_rate",
            "aid.pell_grant_rate": "uc_pell_rate",
            "aid.federal_loan_rate": "uc_fedloan_rate",
            "student.enrollment.undergrad_12_month": "uc_ug_enroll_12m",
            "cost.attendance.academic_year": "uc_coa_ay",
            "admissions.admission_rate.overall": "uc_admit_rate_overall",
        }
    )
    .assign(
        uc_campus=lambda d: d["uc_name_full"].apply(normalize_uc),
        year=lambda d: d["year"].apply(academic_to_calendar_year).astype("Int64"),
    )
    .drop(columns=["uc_name_full"])
    .sort_values(["uc_campus", "year"])
)

uc_feat.to_csv(DATA_PROC / "uc_scorecard_features.csv", index=False)

In [8]:
cc_feat.head()

Unnamed: 0,year,cc_ftft_pell_rate,cc_ftft_fedloan_rate,cc_pell_rate,cc_fedloan_rate,cc_ug_enroll_12m,cc_coa_ay,cc_name
31,2012,0.5779,0.0251,0.2038,0.0095,11043.0,11189.0,ALAMEDA
146,2013,0.6622,0.0068,0.2203,0.0111,11283.0,10251.0,ALAMEDA
261,2014,0.625,0.026,0.2273,0.0117,10844.0,10898.0,ALAMEDA
376,2015,0.5671,0.0087,0.2364,0.0091,10588.0,12265.0,ALAMEDA
491,2016,0.5289,0.0083,0.204,0.0081,10951.0,12038.0,ALAMEDA


In [7]:
uc_feat.head()

Unnamed: 0,year,uc_admit_rate_overall,uc_ftft_pell_rate,uc_ftft_fedloan_rate,uc_pell_rate,uc_fedloan_rate,uc_ug_enroll_12m,uc_coa_ay,uc_campus
0,2012,0.2161,0.2441,0.2897,0.333,0.3181,28774.0,32445.0,Berkeley
10,2013,0.18,0.2682,0.2898,0.3237,0.3021,28662.0,32715.0,Berkeley
20,2014,0.1602,0.241,0.2459,0.3143,0.2698,28919.0,33020.0,Berkeley
30,2015,0.1688,0.2622,0.2683,0.3114,0.2602,29326.0,33989.0,Berkeley
40,2016,0.1693,0.2299,0.2408,0.3031,0.26,29591.0,34924.0,Berkeley


## cc2uc_major

In [9]:
maj_cols = ["Year", "UC", "Field", "Major", "Enrolls", "CC"]
maj_raw = pd.read_csv(DATA_RAW / "cc2uc_major.csv", usecols=maj_cols)

maj_feat = (
    maj_raw.rename(
        columns={
            "Year": "year_acad",
            "UC": "uc_raw",
            "CC": "cc_name_raw",
            "Field": "field",
            "Major": "major",
            "Enrolls": "enrolls",
        }
    )
    .assign(
        year=lambda d: d["year_acad"].apply(academic_to_calendar_year).astype("Int64"),
        cc_name=lambda d: d["cc_name_raw"].apply(normalize_cc),
        uc_campus=lambda d: d["uc_raw"].apply(normalize_uc),
    )
    .drop(columns=["year_acad", "cc_name_raw", "uc_raw"])
)

maj_feat.to_csv(DATA_PROC / "cc2uc_major_features.csv", index=False)
maj_feat.head()

Unnamed: 0,field,major,enrolls,year,cc_name,uc_campus
0,BIOLOGICAL AND BIOMEDICAL SCIENCES,"Biology, General",3,2013,ALLAN HANCOCK,Santa Barbara
1,"COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS",Communication and Media Studies,3,2013,ALLAN HANCOCK,Santa Barbara
2,ENGLISH LANGUAGE AND LITERATURE/LETTERS,"English Language and Literature, General",3,2013,ALLAN HANCOCK,Santa Barbara
3,"FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS","Linguistic, Comparative, and Related Language ...",3,2013,ALLAN HANCOCK,Santa Cruz
4,PSYCHOLOGY,"Psychology, General",6,2013,ALLAN HANCOCK,Santa Barbara


In [10]:

maj_summary = (
    maj_feat.groupby(["cc_name", "uc_campus", "year"], as_index=False)
    .agg(total_major_enrolls=("enrolls", "sum"))
)

maj_summary.to_csv(DATA_PROC / "cc2uc_major_summary.csv", index=False)
maj_summary.head()

Unnamed: 0,cc_name,uc_campus,year,total_major_enrolls
0,ALAMEDA,Berkeley,2014,7
1,ALAMEDA,Berkeley,2017,5
2,ALAMEDA,Berkeley,2018,6
3,ALAMEDA,Berkeley,2019,4
4,ALAMEDA,Berkeley,2020,12


## cc2uc_3status：gender + ethnicity

In [11]:
status_frames = []
status_specs = [
    ("cc2uc_3status_gnd.csv", "Gender", "gender"),
    ("cc2uc_3status_eth.csv", "Ethnicity", "ethnicity"),
]

for fname, group_col, group_type in status_specs:
    df_raw = pd.read_csv(DATA_RAW / fname)

    df_clean = (
        df_raw.rename(
            columns={
                "City": "cc_city",
                "County": "cc_county",
                "School": "cc_name_raw",
                "UC": "uc_raw",
                "Year": "year_acad",
                "Count": "scope",     # App / Adm / Enr
                "Num": "n_students",
                group_col: "group_value",
            }
        )
        .assign(
            group_type=group_type,
            cc_name=lambda d: d["cc_name_raw"].apply(normalize_cc),
            uc_campus=lambda d: d["uc_raw"].apply(normalize_uc),
            year=lambda d: d["year_acad"].apply(academic_to_calendar_year).astype("Int64"),
        )
        .loc[
            :,
            [
                "cc_city",
                "cc_county",
                "cc_name",
                "uc_campus",
                "year",
                "scope",
                "n_students",
                "group_type",
                "group_value",
            ],
        ]
    )

    status_frames.append(df_clean)

status_long = pd.concat(status_frames, ignore_index=True)

gender_enr = (
    status_long
    .query("group_type == 'gender' and scope == 'Enr' and group_value != 'All'")
    .groupby(["cc_name", "uc_campus", "year", "group_value"], as_index=False)
    .agg(n_enroll=("n_students", "sum"))
)

gender_wide = gender_enr.pivot(
    index=["cc_name", "uc_campus", "year"],
    columns="group_value",
    values="n_enroll",
).reset_index()

gender_cols = [c for c in gender_wide.columns if c not in ["cc_name", "uc_campus", "year"]]

for c in gender_cols:
    gender_wide[c] = pd.to_numeric(gender_wide[c], errors="coerce")

gender_wide["gender_total_enr"] = gender_wide[gender_cols].sum(axis=1)

for c in gender_cols:
    col_safe = str(c).lower().replace(" ", "_")
    gender_wide[f"share_gender_{col_safe}"] = gender_wide[c] / gender_wide["gender_total_enr"]

gender_feat = gender_wide[
    ["cc_name", "uc_campus", "year"]
    + [c for c in gender_wide.columns if c.startswith("share_gender_")]
]

share_gender_cols = [c for c in gender_feat.columns if c.startswith("share_gender_")]
gender_feat[share_gender_cols] = gender_feat[share_gender_cols].fillna(0)

gender_feat.to_csv(DATA_PROC / "cc2uc_gender_features.csv", index=False)
gender_feat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_feat[share_gender_cols] = gender_feat[share_gender_cols].fillna(0)


group_value,cc_name,uc_campus,year,share_gender_female,share_gender_male,share_gender_other,share_gender_unknown
0,ALAMEDA,Berkeley,2006,0.52,0.48,0.0,0.0
1,ALAMEDA,Berkeley,2007,0.47619,0.52381,0.0,0.0
2,ALAMEDA,Berkeley,2008,0.631579,0.368421,0.0,0.0
3,ALAMEDA,Berkeley,2009,0.777778,0.222222,0.0,0.0
4,ALAMEDA,Berkeley,2010,0.5625,0.4375,0.0,0.0


In [12]:
eth_enr = (
    status_long
    .query("group_type == 'ethnicity' and scope == 'Enr' and group_value != 'All'")
    .groupby(["cc_name", "uc_campus", "year", "group_value"], as_index=False)
    .agg(n_enroll=("n_students", "sum"))
)

eth_wide = eth_enr.pivot(
    index=["cc_name", "uc_campus", "year"],
    columns="group_value",
    values="n_enroll",
).reset_index()

eth_cols = [c for c in eth_wide.columns if c not in ["cc_name", "uc_campus", "year"]]

for c in eth_cols:
    eth_wide[c] = pd.to_numeric(eth_wide[c], errors="coerce")

eth_wide["eth_total_enr"] = eth_wide[eth_cols].sum(axis=1)

for c in eth_cols:
    col_safe = (
        str(c)
        .lower()
        .replace(" ", "_")
        .replace("/", "_")
    )
    eth_wide[f"share_eth_{col_safe}"] = eth_wide[c] / eth_wide["eth_total_enr"]

eth_feat = eth_wide[
    ["cc_name", "uc_campus", "year"]
    + [c for c in eth_wide.columns if c.startswith("share_eth_")]
]

share_eth_cols = [c for c in eth_feat.columns if c.startswith("share_eth_")]
eth_feat[share_eth_cols] = eth_feat[share_eth_cols].fillna(0)

eth_feat.to_csv(DATA_PROC / "cc2uc_ethnicity_features.csv", index=False)
eth_feat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eth_feat[share_eth_cols] = eth_feat[share_eth_cols].fillna(0)


group_value,cc_name,uc_campus,year,share_eth_african_american,share_eth_american_indian,share_eth_asian,share_eth_domestic_unknown,share_eth_hispanic__latinx,share_eth_int'l,share_eth_pacific_islander,share_eth_white
0,ALAMEDA,Berkeley,2006,0.16,0.0,0.36,0.12,0.16,0.04,0.0,0.16
1,ALAMEDA,Berkeley,2007,0.157895,0.0,0.631579,0.0,0.105263,0.0,0.0,0.105263
2,ALAMEDA,Berkeley,2008,0.105263,0.0,0.210526,0.263158,0.157895,0.0,0.0,0.263158
3,ALAMEDA,Berkeley,2009,0.0,0.0,0.3125,0.125,0.25,0.0,0.0,0.3125
4,ALAMEDA,Berkeley,2010,0.375,0.0,0.25,0.0625,0.0,0.0625,0.0,0.25


## Melt（App / Adm / Enr）

In [14]:
eth_raw = pd.read_csv(DATA_RAW / "cc2uc_3status_eth.csv")

eth_clean = (
    eth_raw.rename(
        columns={
            "City": "cc_city",
            "County": "cc_county",
            "School": "cc_name_raw",   
            "UC": "uc_raw",           
            "Year": "year_acad",       
            "Count": "scope",          # App / Adm / Enr
            "Num": "n_students",      
            "Ethnicity": "ethnicity",
        }
    )
    .assign(
        cc_name=lambda d: d["cc_name_raw"].apply(normalize_cc),
        uc_campus=lambda d: d["uc_raw"].apply(normalize_uc),
        year=lambda d: d["year_acad"].apply(academic_to_calendar_year).astype("Int64"),
    )
)

eth_all = eth_clean.query("ethnicity == 'All'").copy()

overall = (
    eth_all
    .groupby(["cc_name", "uc_campus", "year", "scope"], as_index=False)
    .agg(n_students=("n_students", "sum"))
)

melt = (
    overall.pivot(
        index=["cc_name", "uc_campus", "year"],
        columns="scope",
        values="n_students",
    )
    .reset_index()
    .rename(
        columns={
            "App": "n_app",
            "Adm": "n_admit",
            "Enr": "n_enroll",
        }
    )
)

for col in ["n_app", "n_admit", "n_enroll"]:
    melt[col] = pd.to_numeric(melt[col], errors="coerce")

melt = melt[melt["n_admit"].notna() & (melt["n_admit"] > 0)]

melt["melt_count"] = melt["n_admit"] - melt["n_enroll"]
melt["melt_rate"] = melt["melt_count"] / melt["n_admit"]

melt.to_csv(DATA_PROC / "cc2uc_melt_overall.csv", index=False)

In [15]:
melt.head()

scope,cc_name,uc_campus,year,n_admit,n_app,n_enroll,melt_count,melt_rate
0,ALAMEDA,Berkeley,2006,28.0,79.0,25.0,3.0,0.107143
1,ALAMEDA,Berkeley,2007,30.0,67.0,21.0,9.0,0.3
2,ALAMEDA,Berkeley,2008,21.0,62.0,19.0,2.0,0.095238
3,ALAMEDA,Berkeley,2009,22.0,52.0,18.0,4.0,0.181818
4,ALAMEDA,Berkeley,2010,19.0,62.0,16.0,3.0,0.157895


## cc_uc_drive_distance

In [17]:
dist_raw = pd.read_csv(DATA_RAW / "cc_uc_drive_distances.csv")

dist_feat = (
    dist_raw
    .rename(columns={
        "cc_name": "cc_name_raw",
        "uc_name": "uc_name_full",
    })
    .assign(
        cc_name=lambda d: d["cc_name_raw"].apply(normalize_cc),
        uc_campus=lambda d: d["uc_name_full"].apply(normalize_uc),
    )
    .loc[
        :,
        ["cc_name", "uc_campus", "distance_miles", "duration_hours"],
    ]
    .rename(
        columns={
            "distance_miles": "cc_uc_distance_miles",
            "duration_hours": "cc_uc_drive_hours",
        }
    )
)

dist_feat.to_csv(DATA_PROC / "cc_uc_distance_features.csv", index=False)

In [18]:
dist_feat.head()

Unnamed: 0,cc_name,uc_campus,cc_uc_distance_miles,cc_uc_drive_hours
0,ALLAN HANCOCK,Berkeley,264.32,4.18
1,ALLAN HANCOCK,Davis,317.68,4.88
2,ALLAN HANCOCK,Irvine,198.49,3.27
3,ALLAN HANCOCK,Los Angeles,162.75,2.72
4,ALLAN HANCOCK,Merced,230.16,3.83


## StudentCitizenshipStatus

In [19]:
cit_raw = pd.read_csv(DATA_RAW / "StudentCitizenshipStatus.csv", encoding="latin1")

cit_raw = cit_raw.rename(columns={"Unnamed: 0": "area", "Unnamed: 1": "status"})
cit_raw = cit_raw.iloc[1:].reset_index(drop=True)

cit_raw["area_filled"] = cit_raw["area"].ffill()

cit_raw["cc_name"] = cit_raw["area_filled"].apply(normalize_cc)

cit_sel = cit_raw[cit_raw["status"].astype(str).str.contains("Permanent Resident", na=False)].copy()

count_cols = [
    c
    for c in cit_sel.columns
    if isinstance(c, str)
    and c.startswith("Fall ")
    and ".1" not in c       
]

cit_long = cit_sel.melt(
    id_vars=["cc_name"],
    value_vars=count_cols,
    var_name="term",
    value_name="count_raw",
)

cit_long["year"] = cit_long["term"].str.extract(r"(\d{4})")[0].astype("Int64")

cit_long["count"] = (
    cit_long["count_raw"]
    .astype(str)
    .str.replace(",", "", regex=False)
)
cit_long["count"] = pd.to_numeric(cit_long["count"], errors="coerce")

cit_long = cit_long.dropna(subset=["year", "count"])

perm_feat = (
    cit_long
    .groupby(["cc_name", "year"], as_index=False)
    .agg(cc_perm_resident_count=("count", "sum"))
)

perm_feat.to_csv(DATA_PROC / "cc_perm_resident_by_year.csv", index=False)

In [20]:
perm_feat.head()

Unnamed: 0,cc_name,year,cc_perm_resident_count
0,ALAMEDA,2004,1124.0
1,ALAMEDA,2005,1075.0
2,ALAMEDA,2006,1137.0
3,ALAMEDA,2007,1244.0
4,ALAMEDA,2008,1089.0


## StudentFinAidSumm

In [22]:
aid_raw = pd.read_csv(DATA_RAW / "StudentFinAidSumm.csv", encoding="latin1")

aid_raw = aid_raw.rename(columns={"Unnamed: 0": "area", "Unnamed: 1": "label"})

aid_raw["area_filled"] = aid_raw["area"].ffill()

aid_raw["cc_name"] = (
    aid_raw["area_filled"]
    .str.replace(" Total", "", regex=False)
    .apply(normalize_cc)     
)

wanted_labels = [
    "California College Promise Grant Total",
    "Grants Total",
    "Loans Total",
    "Scholarship Total",
]
aid_sel = aid_raw[aid_raw["label"].isin(wanted_labels)].copy()

amount_cols = [
    c for c in aid_sel.columns
    if isinstance(c, str) and c.startswith("Annual ") and c.endswith(".2")
]

aid_long = aid_sel.melt(
    id_vars=["cc_name", "label"],
    value_vars=amount_cols,
    var_name="year_col",
    value_name="amount_raw",
)

aid_long["year_acad"] = aid_long["year_col"].str.extract(r"Annual (\d{4}-\d{4})")[0]
aid_long["year"] = aid_long["year_acad"].apply(academic_to_calendar_year).astype("Int64")

aid_long["amount"] = (
    aid_long["amount_raw"]
    .astype(str)
    .str.replace(r"[\$,]", "", regex=True)
)
aid_long["amount"] = pd.to_numeric(aid_long["amount"], errors="coerce")

aid_long = aid_long.dropna(subset=["year", "amount"])

aid_feat = aid_long.pivot_table(
    index=["cc_name", "year"],
    columns="label",
    values="amount",
    aggfunc="sum",
).reset_index()

aid_feat = aid_feat.rename(
    columns={
        "California College Promise Grant Total": "cc_aid_promise_amt",
        "Grants Total": "cc_aid_grants_amt",
        "Loans Total": "cc_aid_loans_amt",
        "Scholarship Total": "cc_aid_scholarship_amt",
    }
)

aid_feat.to_csv(DATA_PROC / "cc_aid_features_by_year.csv", index=False)

In [23]:
aid_feat.head()

label,cc_name,year,cc_aid_promise_amt,cc_aid_grants_amt,cc_aid_loans_amt,cc_aid_scholarship_amt
0,ALAMEDA,2005,1085696.0,3177157.0,100801.0,19591.0
1,ALAMEDA,2006,1204515.0,3122790.0,112840.0,16721.0
2,ALAMEDA,2007,1099959.0,3058611.0,101014.0,9676.0
3,ALAMEDA,2008,914730.0,3476723.0,66825.0,27819.0
4,ALAMEDA,2009,1148620.0,3745699.0,231350.0,5500.0


# Merge + Handle missing data

In [24]:
panel = melt.copy()

panel = panel.merge(cc_feat, on=["cc_name", "year"], how="left")
panel = panel.merge(uc_feat, on=["uc_campus", "year"], how="left")
panel = panel.merge(maj_summary, on=["cc_name", "uc_campus", "year"], how="left")
panel = panel.merge(gender_feat, on=["cc_name", "uc_campus", "year"], how="left")
panel = panel.merge(eth_feat, on=["cc_name", "uc_campus", "year"], how="left")


panel = panel.merge(
    dist_feat,
    on=["cc_name", "uc_campus"],
    how="left",
)

panel = panel.merge(
    aid_feat[
        [
            "cc_name",
            "year",
            "cc_aid_promise_amt",
            "cc_aid_grants_amt",
            "cc_aid_loans_amt",
            "cc_aid_scholarship_amt",
        ]
    ],
    on=["cc_name", "year"],
    how="left",
)

panel = panel.merge(
    perm_feat[
        [
            "cc_name",
            "year",
            "cc_perm_resident_count",
        ]
    ],
    on=["cc_name", "year"],
    how="left",
)

cols_to_drop = [
    "cc_ftft_pell_rate",
    "cc_ftft_fedloan_rate",
    "cc_pell_rate",
    "cc_fedloan_rate",
    "uc_admit_rate_overall",
    "uc_ftft_pell_rate",
    "uc_ftft_fedloan_rate",
    "uc_pell_rate",
    "uc_fedloan_rate",
    "share_gender_other",
    "share_gender_unknown",
    "share_eth_domestic_unknown",
    "total_major_enrolls",
    "cc_uc_distance_miles",
    ]
panel = panel.drop(columns=cols_to_drop, errors="ignore")

# imputations
#cc_cols = ["cc_ug_enroll_12m", "cc_coa_ay"]

# for col in cc_cols:
#      if col in panel.columns:
#         panel[col] = (
#             panel
#             .groupby("cc_name")[col]
#             .transform(lambda s: s.fillna(s.median()))
#         )
#         panel[col] = panel[col].fillna(panel[col].median())

def backcast_uc_with_trend(panel: pd.DataFrame, col: str) -> pd.DataFrame:
    df = panel.copy()

    for campus, grp in df.groupby("uc_campus"):
        grp = grp.sort_values("year")
        years = grp["year"].to_numpy()
        values = grp[col].to_numpy(dtype="float64")

        series = pd.Series(values, index=years)
        series_interp = series.interpolate()

        mask_obs = series_interp.notna().to_numpy()

        x = years[mask_obs].astype(float)
        y = series_interp.to_numpy()[mask_obs].astype(float)

        a, b = np.polyfit(x, y, 1)
        filled = series_interp.to_numpy().copy()

        year_min_obs = x.min()
        early_mask = years < year_min_obs
        if early_mask.any():
            filled[early_mask] = a * years[early_mask] + b

        df.loc[grp.index, col] = filled

    return df
for col in ["uc_ug_enroll_12m", "uc_coa_ay"]:
    panel = backcast_uc_with_trend(panel, col)

def backcast_cc_with_trend(panel: pd.DataFrame, col: str) -> pd.DataFrame:
    df = panel.copy()

    for campus, grp in df.groupby("cc_name"):
        grp = grp.sort_values("year")
        years = grp["year"].to_numpy()
        values = grp[col].to_numpy(dtype="float64")

        series = pd.Series(values, index=years)
        series_interp = series.interpolate()

        mask_obs = series_interp.notna().to_numpy()

        x = years[mask_obs].astype(float)
        y = series_interp.to_numpy()[mask_obs].astype(float)

        a, b = np.polyfit(x, y, 1)
        filled = series_interp.to_numpy().copy()

        year_min_obs = x.min()
        early_mask = years < year_min_obs
        if early_mask.any():
            filled[early_mask] = a * years[early_mask] + b

        df.loc[grp.index, col] = filled

    return df

for col in ["cc_ug_enroll_12m", "cc_coa_ay"]:
    panel = backcast_cc_with_trend(panel, col)

for col in ["cc_aid_loans_amt", "cc_aid_scholarship_amt"]:
    if col in panel.columns:
        panel[col] = panel[col].fillna(0)


panel["cc_perm_resident_share"] = (
    panel["cc_perm_resident_count"] / panel["cc_ug_enroll_12m"]
)
cols_to_drop = [
    "cc_perm_resident_count",
]
panel = panel.drop(columns=cols_to_drop, errors="ignore")

panel.to_csv(DATA_PROC / "melt_panel_master.csv", index=False)


In [25]:
panel.head()

Unnamed: 0,cc_name,uc_campus,year,n_admit,n_app,n_enroll,melt_count,melt_rate,cc_ug_enroll_12m,cc_coa_ay,...,share_eth_hispanic__latinx,share_eth_int'l,share_eth_pacific_islander,share_eth_white,cc_uc_drive_hours,cc_aid_promise_amt,cc_aid_grants_amt,cc_aid_loans_amt,cc_aid_scholarship_amt,cc_perm_resident_share
0,ALAMEDA,Berkeley,2006,28.0,79.0,25.0,3.0,0.107143,12289.967033,5888.89011,...,0.16,0.04,0.0,0.16,0.39,1204515.0,3122790.0,112840.0,16721.0,0.092514
1,ALAMEDA,Berkeley,2007,30.0,67.0,21.0,9.0,0.3,12124.71978,6601.950549,...,0.105263,0.0,0.0,0.105263,0.39,1099959.0,3058611.0,101014.0,9676.0,0.1026
2,ALAMEDA,Berkeley,2008,21.0,62.0,19.0,2.0,0.095238,11959.472527,7315.010989,...,0.157895,0.0,0.0,0.263158,0.39,914730.0,3476723.0,66825.0,27819.0,0.091058
3,ALAMEDA,Berkeley,2009,22.0,52.0,18.0,4.0,0.181818,11794.225275,8028.071429,...,0.25,0.0,0.0,0.3125,0.39,1148620.0,3745699.0,231350.0,5500.0,0.096573
4,ALAMEDA,Berkeley,2010,19.0,62.0,16.0,3.0,0.157895,11628.978022,8741.131868,...,0.0,0.0625,0.0,0.25,0.39,1469319.0,5055887.0,277249.0,51078.0,0.084272


In [60]:
cc = "ALAMEDA"

cc_yearly = (
    panel
    .loc[
        (panel["cc_name"] == cc),
        ["cc_name", "year", "cc_ug_enroll_12m", "cc_coa_ay"]
    ]
    
    .drop_duplicates(subset=["cc_name", "year"])
)

cc_yearly


Unnamed: 0,cc_name,year,cc_ug_enroll_12m,cc_coa_ay
0,ALAMEDA,2006,12289.967033,5888.89011
1,ALAMEDA,2007,12124.71978,6601.950549
2,ALAMEDA,2008,11959.472527,7315.010989
3,ALAMEDA,2009,11794.225275,8028.071429
4,ALAMEDA,2010,11628.978022,8741.131868
5,ALAMEDA,2011,11463.730769,9454.192308
6,ALAMEDA,2012,11043.0,11189.0
7,ALAMEDA,2013,11283.0,10251.0
8,ALAMEDA,2014,10844.0,10898.0
9,ALAMEDA,2015,10588.0,12265.0


In [42]:
cc_panel

Unnamed: 0,cc_name,uc_campus,year,n_admit,n_app,n_enroll,melt_count,melt_rate,cc_ug_enroll_12m,cc_coa_ay,...,share_eth_hispanic__latinx,share_eth_int'l,share_eth_pacific_islander,share_eth_white,cc_uc_drive_hours,cc_aid_promise_amt,cc_aid_grants_amt,cc_aid_loans_amt,cc_aid_scholarship_amt,cc_perm_resident_share
0,ALAMEDA,Berkeley,2006,28.0,79.0,25.0,3.0,0.107143,10581.5,14141.0,...,0.160,0.040,0.0,0.160000,0.39,1204515.0,3122790.0,112840.0,16721.0,0.107452
19,ALAMEDA,Davis,2006,37.0,56.0,20.0,17.0,0.459459,10581.5,14141.0,...,0.050,0.150,0.0,0.000000,1.21,1204515.0,3122790.0,112840.0,16721.0,0.107452
38,ALAMEDA,Irvine,2006,11.0,18.0,0.0,11.0,1.000000,10581.5,14141.0,...,0.000,0.000,0.0,0.000000,6.62,1204515.0,3122790.0,112840.0,16721.0,0.107452
57,ALAMEDA,Los Angeles,2006,18.0,48.0,6.0,12.0,0.666667,10581.5,14141.0,...,0.000,0.000,0.0,0.500000,5.99,1204515.0,3122790.0,112840.0,16721.0,0.107452
76,ALAMEDA,Merced,2006,12.0,14.0,1.0,11.0,0.916667,10581.5,14141.0,...,0.000,0.000,0.0,0.000000,2.25,1204515.0,3122790.0,112840.0,16721.0,0.107452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,ALAMEDA,Merced,2024,13.0,18.0,1.0,12.0,0.923077,10581.5,14141.0,...,1.000,0.000,0.0,0.000000,2.25,1875604.0,6473072.0,298658.0,27313.0,0.054340
113,ALAMEDA,Riverside,2024,20.0,29.0,0.0,20.0,1.000000,10581.5,14141.0,...,0.000,0.000,0.0,0.000000,6.84,1875604.0,6473072.0,298658.0,27313.0,0.054340
132,ALAMEDA,San Diego,2024,44.0,53.0,17.0,27.0,0.613636,10581.5,14141.0,...,0.200,0.400,0.0,0.066667,8.01,1875604.0,6473072.0,298658.0,27313.0,0.054340
151,ALAMEDA,Santa Barbara,2024,32.0,45.0,3.0,29.0,0.906250,10581.5,14141.0,...,0.000,0.500,0.0,0.000000,5.19,1875604.0,6473072.0,298658.0,27313.0,0.054340


In [61]:
panel.shape

(17654, 27)

In [34]:
panel.columns

Index(['cc_name', 'uc_campus', 'year', 'n_admit', 'n_app', 'n_enroll',
       'melt_count', 'melt_rate', 'cc_ug_enroll_12m', 'cc_coa_ay',
       'uc_ug_enroll_12m', 'uc_coa_ay', 'share_gender_female',
       'share_gender_male', 'share_eth_african_american',
       'share_eth_american_indian', 'share_eth_asian',
       'share_eth_hispanic__latinx', 'share_eth_int'l',
       'share_eth_pacific_islander', 'share_eth_white', 'cc_uc_drive_hours',
       'cc_aid_promise_amt', 'cc_aid_grants_amt', 'cc_aid_loans_amt',
       'cc_aid_scholarship_amt', 'cc_perm_resident_share'],
      dtype='object')