In [257]:
# %pip install lightgbm

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import re


# Data Clean

In [258]:
def load_fin_aid_clean():
    raw = pd.read_csv("../data_files/StudentFinAidSumm.csv", header=None, encoding="latin1")

    header_year = raw.iloc[0]
    header_metric = raw.iloc[1]

    data = raw.iloc[2:].reset_index(drop=True)

    col_names = []
    n_cols = raw.shape[1]

    for j in range(n_cols):
        if j < 3:
            col_names.append(f"level{j+1}")
        else:
            year_label = str(header_year[j])
            metric_label = str(header_metric[j])
            m = re.search(r"(\d{4}-\d{4})", year_label)
            year = m.group(1) if m else year_label

            metric = metric_label.strip()
            col_names.append(f"{year}|{metric}")

    data.columns = col_names

    id_cols = ["level1", "level2", "level3"]
    data[id_cols] = data[id_cols].ffill()
    

    value_cols = [c for c in data.columns if c not in id_cols]

    long = data.melt(
        id_vars=id_cols,
        value_vars=value_cols,
        var_name="year_metric",
        value_name="value",
    )

    long[["year", "metric"]] = long["year_metric"].str.split("|", n=1, expand=True)
    

    tidy = (
        long
        .pivot_table(
            index=id_cols + ["year"],
            columns="metric",
            values="value",
            aggfunc="first",
        )
        .reset_index()
    )

    tidy.columns.name = None

    if "Aid Amount" in tidy.columns:
        tidy["Aid Amount"] = (
            tidy["Aid Amount"]
            .astype(str)
            .str.replace(r"[$,]", "", regex=True)
            .replace("nan", pd.NA)
            .astype(float)
        )

    for col in ["Student Count", "Award Count"]:
        if col in tidy.columns:
            tidy[col] = pd.to_numeric(tidy[col], errors="coerce")

    tidy["year"] = tidy["year"].str[:4].astype(int)
    tidy['level2'] = tidy['level2'].apply(lambda x: x[:-6])
    
    return tidy

def load_citizenship_clean():
    raw = pd.read_csv("../data_files/StudentCitizenshipStatus.csv", header=None, encoding="latin1")

    header_term = raw.iloc[0]
    header_metric = raw.iloc[1]

    data = raw.iloc[2:].reset_index(drop=True)

    col_names = []
    n_cols = raw.shape[1]

    for j in range(n_cols):
        if j == 0:
            col_names.append("level1")
        elif j == 1:
            col_names.append("level2")
        else:
            term = str(header_term[j]).strip()
            metric = str(header_metric[j]).strip()
            col_names.append(f"{term}|{metric}")

    data.columns = col_names

    data[["level1", "level2"]] = data[["level1", "level2"]].ffill()

    value_cols = [c for c in data.columns if c not in ["level1", "level2"]]

    long = data.melt(
        id_vars=["level1", "level2"],
        value_vars=value_cols,
        var_name="term_metric",
        value_name="value",
    )

    long[["term", "metric"]] = long["term_metric"].str.split("|", n=1, expand=True)

    tidy = (
        long
        .pivot_table(
            index=["level1", "level2", "term"],
            columns="metric",
            values="value",
            aggfunc="first",
        )
        .reset_index()
    )

    tidy.columns.name = None

    if "Student Count" in tidy.columns:
        tidy["Student Count"] = (
            tidy["Student Count"]
            .astype(str)
            .str.replace(",", "", regex=False)
            .replace("nan", np.nan)
            .astype(float)
        )

    if "Student Count (%)" in tidy.columns:
        tidy["Student Count (%)"] = (
            pd.to_numeric(
                tidy["Student Count (%)"]
                .astype(str)
                .str.rstrip("%"),
                errors="coerce",
            ) / 100.0
        )

    tidy["year"] = tidy["term"].str[-4:].astype(int)

    return tidy


cc_scorecard = pd.read_csv("../data_files/cc_scorecard.csv")
uc_scorecard = pd.read_csv("../data_files/uc_scorecard.csv")

cc2uc_major = pd.read_csv("../data_files/cc2uc_major.csv")
cc2uc_eth = pd.read_csv("../data_files/cc2uc_3status_eth.csv")
cc2uc_gnd = pd.read_csv("../data_files/cc2uc_3status_gnd.csv")

fin_aid = load_fin_aid_clean()
citizen = load_citizenship_clean()

dist = pd.read_csv("../data_files/cc_uc_drive_distances.csv")


In [259]:
# Normalize columns
def normalize_cols(df):
    df = df.copy()
    df.columns = [c.strip().lower() for c in df.columns]
    return df

cc_scorecard = normalize_cols(cc_scorecard)
uc_scorecard = normalize_cols(uc_scorecard)
cc2uc_major = normalize_cols(cc2uc_major)
cc2uc_eth = normalize_cols(cc2uc_eth)
cc2uc_gnd = normalize_cols(cc2uc_gnd)
fin_aid = normalize_cols(fin_aid)
citizen = normalize_cols(citizen)
dist = normalize_cols(dist)

In [260]:
# UC & year normalization
uc_name_mapping = {
    "UCB" : "Berkeley",
    "UCD" : "Davis",
    "UCLA" : "Los Angeles",
    "UCSD" : "San Diego",
    "UCI" : "Irvine",
    "UCSB" : "Santa Barbara",
    "UCSC" : "Santa Cruz",
    "UCM" : "Merced",
    "UCR" : "Riverside"
}
uc_scorecard = uc_scorecard.rename(columns={"school.name": "uc"})
uc_scorecard['uc'] = uc_scorecard['uc'].apply(lambda x: x[25:])
cc2uc_major["uc"] = cc2uc_major["uc"].map(uc_name_mapping)
dist = dist.rename(columns={"uc_name" : "uc"})
dist['uc'] = dist['uc'].apply(lambda x: x[25:])

cc2uc_major['year'] = cc2uc_major['year'].apply(lambda x: int(x[:4]))
cc2uc_eth['year'] = cc2uc_eth['year'].apply(lambda x: int(x[:4]))
cc2uc_gnd['year'] = cc2uc_gnd['year'].apply(lambda x: int(x[:4]))

In [261]:
# CC normalization
def normalize_cc_name(name: str) -> str:
    if pd.isna(name):
        return ""
    s = str(name).upper().strip()

    s = re.sub(r"\s+", " ", s)

    s = s.replace("MT.", "MOUNT").replace("MT ", "MOUNT ")
    s = s.replace(" ST.", " SAINT").replace(" ST ", " SAINT ")
    s = re.sub(r"\b(TOTAL|DISTRICT)\b", "", s)
    s = re.sub(r"^(CITY\s+COLLEGE|COLLEGE)\s+OF\s+", "", s)
    s = re.sub(r"\bCOMMUNITY COLLEGE\b", "", s)
    s = re.sub(r"\bCOLLEGE\b", "", s)
    s = re.sub(r"^THE\s+", "", s)
    s = re.sub(r"[^A-Z0-9 ]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()

    return s

cc2uc_major = cc2uc_major.rename(columns={"cc": "school"})
cc2uc_major["cc"] = cc2uc_major["school"].apply(normalize_cc_name)
cc2uc_eth["cc"] = cc2uc_eth["school"].apply(normalize_cc_name)
cc2uc_gnd["cc"] = cc2uc_gnd["school"].apply(normalize_cc_name)
cc_scorecard["cc"] = cc_scorecard["school.name"].apply(normalize_cc_name)
fin_aid["cc"] = fin_aid["level1"].apply(normalize_cc_name)
citizen["cc"] = citizen["level1"].apply(normalize_cc_name)
dist["cc"] = dist["cc_name"].apply(normalize_cc_name)


In [272]:
#Change type to numeric
def num_normalize(col):
    s = (
        col.astype(str)
        .str.replace(",", "", regex=False)
        .str.replace(r"[^\d\.\-]", "", regex=True)
    )
    return pd.to_numeric(s, errors="coerce")

cc2uc_major = cc2uc_major.rename(columns={"enrolls": "num"})
cc2uc_major["num"] = num_normalize(cc2uc_major["num"])
cc2uc_eth["num"] = num_normalize(cc2uc_eth["num"])
cc2uc_gnd["num"] = num_normalize(cc2uc_gnd["num"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cc2uc_gnd["num"] = num_normalize(cc2uc_gnd["num"])


In [264]:
# Choose 2012-2023 data
YEAR_MIN, YEAR_MAX = 2012, 2023

def year_filter(df, col="year"):
    return df[(df[col] >= YEAR_MIN) & (df[col] <= YEAR_MAX)]

cc_scorecard = year_filter(cc_scorecard)
uc_scorecard = year_filter(uc_scorecard)
cc2uc_major = year_filter(cc2uc_major)
cc2uc_eth = year_filter(cc2uc_eth)
cc2uc_gnd = year_filter(cc2uc_gnd)
fin_aid = year_filter(fin_aid)
citizen = year_filter(citizen)


# Benchmark

In [276]:
cc2uc_app = cc2uc_gnd[(cc2uc_gnd['gender'] == 'All') & (cc2uc_gnd['count'] == 'App')].drop(columns=["gender",'count'])
cc2uc_adm = cc2uc_gnd[(cc2uc_gnd['gender'] == 'All') & (cc2uc_gnd['count'] == 'Adm')].drop(columns=["gender",'count'])
cc2uc_benchmark = pd.merge(cc2uc_app,cc2uc_adm,on=["year","cc","city","county","school",'uc'],suffixes=['_app','_adm'])
cc2uc_benchmark['adm_rate'] = cc2uc_benchmark['num_adm']/cc2uc_benchmark['num_app']

# Feature Selection

In [266]:
data_dic = pd.read_csv("../data_files/Data Dictionary - STA221.csv")
data_dic = normalize_cols(data_dic)

mask_q1 = (data_dic["q1"] == True) & (data_dic["source"].str.contains("scorecard"))
q1_scorecard_cols = data_dic.loc[mask_q1, "feature"].tolist()


In [267]:
cc_feat = cc_scorecard[["year", "cc"] + q1_scorecard_cols].copy()
uc_feat = uc_scorecard[["year", "uc"] + q1_scorecard_cols].copy()

cc_feat = cc_feat.add_prefix("cc_")
uc_feat = uc_feat.add_prefix("uc_")

cc_feat = cc_feat.rename(columns={"cc_year": "year", "cc_cc": "cc"})
uc_feat = uc_feat.rename(columns={"uc_year": "year", "uc_uc": "uc"})


In [268]:
cc2uc_major_agg = (
    cc2uc_major
    .groupby(["year", "cc", "uc"])
    .agg(
        total_enrolls=("enrolls", "sum"),
        n_fields=("field", "nunique"),
        n_majors=("major", "nunique")
    )
    .reset_index()
)


In [None]:
# 数值列（注意列名是 normalize 之后的小写）
fin_num_cols = []
for cand in ["aid amount", "student count", "award count"]:
    if cand in fin.columns:
        fin_num_cols.append(cand)

fin_cc = (
    fin.groupby(["year", "cc"], as_index=False)[fin_num_cols]
       .sum()
)

# 重命名一下，避免和别的冲突
fin_rename = {c: f"fin_{c.replace(' ', '_')}_sum" for c in fin_num_cols}
fin_cc = fin_cc.rename(columns=fin_rename)

# ===========================
# 3. 处理 citizen：pivot 成各 citizenship 类型的占比特征
# ===========================

cit = citizen.copy()

# 确认列存在
if "student count (%)" in cit.columns:
    # 用占比做特征，按 level2（citizenship 类型）展开
    cit_pivot = (
        cit.pivot_table(
            index=["year", "cc"],
            columns="level2",
            values="student count (%)",
            aggfunc="first"
        )
        .reset_index()
    )
    cit_pivot.columns.name = None

    # 给列加前缀，且把空格等清理一下
    new_cols = []
    for c in cit_pivot.columns:
        if c in ["year", "cc"]:
            new_cols.append(c)
        else:
            cname = str(c).lower().strip()
            cname = re.sub(r"\s+", "_", cname)
            new_cols.append(f"citizen_pct_{cname}")
    cit_pivot.columns = new_cols
else:
    # 如果没有占比列，就用 student count 做总量占位
    cit_num_cols = []
    if "student count" in cit.columns:
        cit_num_cols.append("student count")
    cit_pivot = (
        cit.groupby(["year", "cc"], as_index=False)[cit_num_cols]
           .sum()
    )
    cit_pivot = cit_pivot.rename(columns={"student count": "citizen_student_count_sum"})

# ===========================
# 4. 处理距离：挑一个距离列出来
# ===========================

dist_df = dist.copy()
dist_cols = dist_df.columns
dist_val_candidates = [c for c in dist_cols if ("mile" in c) or ("dist" in c)]
if len(dist_val_candidates) == 0:
    print("提示：dist 里没找到包含 'mile' 或 'dist' 的列，暂时不加距离特征。")
    dist_ccuc = None
else:
    dist_val_col = dist_val_candidates[0]
    # 转成数值
    dist_df[dist_val_col] = num_normalize(dist_df[dist_val_col])
    dist_ccuc = dist_df[["cc", "uc", dist_val_col]].rename(columns={dist_val_col: "drive_miles"})
