# Cleaning and Normalizing Data Mart Files

The files downloaded directly from the [CCC Data Mart](https://datamart.cccco.edu/) are not formatted well. The purpose of this set of scripts is to clean up those files of unhelpful formatting and to normalize for analysis.

For the associated cleaning and denormalization of the EOPS files, see `ucd_sta_221_project/data_files/cc_eops/clean_eops.ipynb`.

In [27]:
import pandas as pd

In [None]:
top_code = {
    "math": "170100",
    "math": "150100"
}

In [None]:
path = "ucd_sta_221_project/data_files"
file = f"datamart_math_success_retention_{top_code.get("math")}"

math = pd.read_csv(
    f"{path}/{file}.csv",
    header=None
)
math = math.drop(index=1).reset_index(drop=True) # Contains the word "Transferable"
math.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,240
0,,Fall 2011,Fall 2011,Fall 2011,Fall 2011,Fall 2011,Fall 2012,Fall 2012,Fall 2012,Fall 2012,...,Winter 2022,Winter 2022,Winter 2022,Winter 2022,Winter 2022,Winter 2023,Winter 2023,Winter 2023,Winter 2023,Winter 2023
1,,Enrollment Count,Retention Count,Success Count,Retention Rate,Success Rate,Enrollment Count,Retention Count,Success Count,Retention Rate,...,Enrollment Count,Retention Count,Success Count,Retention Rate,Success Rate,Enrollment Count,Retention Count,Success Count,Retention Rate,Success Rate
2,Alameda,464,323,262,69.61%,56.47%,465,331,268,71.18%,...,,,,,,,,,,
3,Allan Hancock,1313,1033,795,78.67%,60.55%,1200,1002,795,83.50%,...,,,,,,,,,,
4,American River,4260,3456,2942,81.13%,69.06%,3976,3223,2677,81.06%,...,,,,,,,,,,


In [None]:
new_cols = [f"{a} {b}".strip('_') for a, b in zip(math.iloc[0], math.iloc[1])]
math.columns = new_cols
math.rename(columns={"nan nan": "College"}, inplace=True)
math = math.drop(index=[0, 1]).reset_index(drop=True)  # drop the first two rows

math.head()

Unnamed: 0,College,Fall 2011 Enrollment Count,Fall 2011 Retention Count,Fall 2011 Success Count,Fall 2011 Retention Rate,Fall 2011 Success Rate,Fall 2012 Enrollment Count,Fall 2012 Retention Count,Fall 2012 Success Count,Fall 2012 Retention Rate,...,Winter 2022 Enrollment Count,Winter 2022 Retention Count,Winter 2022 Success Count,Winter 2022 Retention Rate,Winter 2022 Success Rate,Winter 2023 Enrollment Count,Winter 2023 Retention Count,Winter 2023 Success Count,Winter 2023 Retention Rate,Winter 2023 Success Rate
0,Alameda,464,323,262,69.61%,56.47%,465,331,268,71.18%,...,,,,,,,,,,
1,Allan Hancock,1313,1033,795,78.67%,60.55%,1200,1002,795,83.50%,...,,,,,,,,,,
2,American River,4260,3456,2942,81.13%,69.06%,3976,3223,2677,81.06%,...,,,,,,,,,,
3,Antelope Valley,1829,1479,1146,80.86%,62.66%,1696,1458,1188,85.97%,...,,,,,,,,,,
4,Bakersfield,1282,1018,791,79.41%,61.70%,1573,1243,995,79.02%,...,,,,,,,,,,


In [None]:
# keep only headcounts. We will re-compute rates later
math.columns = math.columns.str.strip()
cols_to_keep = ["College"]
cols_to_keep += [c for c in math.columns if c.endswith("Count")]
math = math.loc[:, cols_to_keep].copy()

math.head()

Unnamed: 0,College,Fall 2011 Enrollment Count,Fall 2011 Retention Count,Fall 2011 Success Count,Fall 2012 Enrollment Count,Fall 2012 Retention Count,Fall 2012 Success Count,Fall 2013 Enrollment Count,Fall 2013 Retention Count,Fall 2013 Success Count,...,Winter 2020 Success Count,Winter 2021 Enrollment Count,Winter 2021 Retention Count,Winter 2021 Success Count,Winter 2022 Enrollment Count,Winter 2022 Retention Count,Winter 2022 Success Count,Winter 2023 Enrollment Count,Winter 2023 Retention Count,Winter 2023 Success Count
0,Alameda,464,323,262,465,331,268,650,473,390,...,,,,,,,,,,
1,Allan Hancock,1313,1033,795,1200,1002,795,1275,1061,846,...,,,,,,,,,,
2,American River,4260,3456,2942,3976,3223,2677,3937,3213,2591,...,,,,,,,,,,
3,Antelope Valley,1829,1479,1146,1696,1458,1188,1803,1540,1246,...,258.0,,,,,,,,,
4,Bakersfield,1282,1018,791,1573,1243,995,1847,1536,1180,...,,,,,,,,,,


In [None]:
cols_to_cast = [c for c in math.columns if c != "College"]

math[cols_to_cast] = math[cols_to_cast].replace({",": ""}, regex=True)

math[cols_to_cast] = (
    math[cols_to_cast]
    .apply(pd.to_numeric, errors="coerce")
    .astype("Int64")
)

math.head()

Unnamed: 0,College,Fall 2011 Enrollment Count,Fall 2011 Retention Count,Fall 2011 Success Count,Fall 2012 Enrollment Count,Fall 2012 Retention Count,Fall 2012 Success Count,Fall 2013 Enrollment Count,Fall 2013 Retention Count,Fall 2013 Success Count,...,Winter 2020 Success Count,Winter 2021 Enrollment Count,Winter 2021 Retention Count,Winter 2021 Success Count,Winter 2022 Enrollment Count,Winter 2022 Retention Count,Winter 2022 Success Count,Winter 2023 Enrollment Count,Winter 2023 Retention Count,Winter 2023 Success Count
0,Alameda,464,323,262,465,331,268,650,473,390,...,,,,,,,,,,
1,Allan Hancock,1313,1033,795,1200,1002,795,1275,1061,846,...,,,,,,,,,,
2,American River,4260,3456,2942,3976,3223,2677,3937,3213,2591,...,,,,,,,,,,
3,Antelope Valley,1829,1479,1146,1696,1458,1188,1803,1540,1246,...,258.0,,,,,,,,,
4,Bakersfield,1282,1018,791,1573,1243,995,1847,1536,1180,...,,,,,,,,,,


In [None]:
# Winter intersession terms are very short and not representative;
# drop these columns
math.columns = math.columns.str.strip()
cols_to_drop = [c for c in math.columns if "Winter" in c]
math = math.drop(columns=cols_to_drop)

math.head()

Unnamed: 0,College,Fall 2011 Enrollment Count,Fall 2011 Retention Count,Fall 2011 Success Count,Fall 2012 Enrollment Count,Fall 2012 Retention Count,Fall 2012 Success Count,Fall 2013 Enrollment Count,Fall 2013 Retention Count,Fall 2013 Success Count,...,Summer 2020 Success Count,Summer 2021 Enrollment Count,Summer 2021 Retention Count,Summer 2021 Success Count,Summer 2022 Enrollment Count,Summer 2022 Retention Count,Summer 2022 Success Count,Summer 2023 Enrollment Count,Summer 2023 Retention Count,Summer 2023 Success Count
0,Alameda,464,323,262,465,331,268,650,473,390,...,162,129,102,91,97,75,65,89,69,62
1,Allan Hancock,1313,1033,795,1200,1002,795,1275,1061,846,...,567,564,473,385,527,467,393,548,482,419
2,American River,4260,3456,2942,3976,3223,2677,3937,3213,2591,...,946,1069,905,765,826,684,590,843,740,611
3,Antelope Valley,1829,1479,1146,1696,1458,1188,1803,1540,1246,...,655,792,715,606,704,644,547,864,792,628
4,Bakersfield,1282,1018,791,1573,1243,995,1847,1536,1180,...,667,742,633,469,751,666,516,1157,955,715


In [34]:
def get_academic_year_terms(year: str):
    """
    Given a year as a string (e.g., "2023"), return a list of academic terms
    for that academic year in the format:
    - Fall {previous_year}
    - Spring {year}
    - Summer {year}

    :param year: The year for which to generate academic terms.
    :return: A list of academic terms for the specified year.
    """
    return [
        f"Fall {int(year) - 1}",
        f"Spring {year}",
        f"Summer {year}"
    ]

get_academic_year_terms("2012")

['Fall 2011', 'Spring 2012', 'Summer 2012']

In [35]:
def build_rates_by_year(year: str, rate: str, df: pd.DataFrame):
    """ 
    Given a reporting year and a rate type (e.g., "Success" or "Retention"),
    computes the rate by summing over each associated term in the reporting year
    and then dividing the appropriate counts.

    :param year: The reporting year as a string (e.g., "2012").
    :param rate: The type of rate to compute ("Success" or "Retention").
    """

    terms = get_academic_year_terms(year)
    if rate == "Success":
        num_cols = [f"{term} Success Count" for term in terms]
        denom_cols = [f"{term} Enrollment Count" for term in terms]
    elif rate == "Retention":
        num_cols = [f"{term} Retention Count" for term in terms]
        denom_cols = [f"{term} Enrollment Count" for term in terms]
    else:
        raise ValueError("Rate must be either 'Success' or 'Retention'.")

    df[f"{year} {rate} Rate"] = (
        df[num_cols].sum(axis=1) / df[denom_cols].sum(axis=1)
    )

    df.drop(columns = num_cols, inplace=True)

    return df

In [36]:
def build_rates(years: list[str], df: pd.DataFrame):
    """
    Given a list of reporting years, computes the Success and Retention rates
    for each combination and updates the DataFrame in place.

    :param years: A list of reporting years as strings (e.g., ["2012", "2013"]).
    :param df: The DataFrame containing the data.
    """
    for year in years:
        for rate in ["Success", "Retention"]:
            df = build_rates_by_year(year, rate, df)
            
    
    # Drop Enrollment Count columns after rates are computed
    df.columns = df.columns.str.strip()
    cols_to_drop = [
        c for c in df.columns if "Enrollment Count" in c
    ]
    df.drop(columns=cols_to_drop, inplace=True)

In [None]:
build_rates(range(2012, 2024), math)

math.head()

Unnamed: 0,College,2012 Success Rate,2012 Retention Rate,2013 Success Rate,2013 Retention Rate,2014 Success Rate,2014 Retention Rate,2015 Success Rate,2015 Retention Rate,2016 Success Rate,...,2019 Success Rate,2019 Retention Rate,2020 Success Rate,2020 Retention Rate,2021 Success Rate,2021 Retention Rate,2022 Success Rate,2022 Retention Rate,2023 Success Rate,2023 Retention Rate
0,Alameda,0.556738,0.673759,0.579952,0.697693,0.617666,0.744681,0.588507,0.731836,0.589623,...,0.612516,0.76043,0.714495,0.808931,0.602451,0.738598,0.554054,0.716216,0.517705,0.676435
1,Allan Hancock,0.614829,0.788564,0.666779,0.82432,0.686919,0.82672,0.666211,0.822435,0.66753,...,0.662391,0.844209,0.688406,0.849235,0.605065,0.81953,0.585925,0.791209,0.622407,0.835001
2,American River,0.679089,0.808152,0.675178,0.8125,0.668139,0.822333,0.675852,0.823005,0.68929,...,0.694654,0.844522,0.738849,0.849817,0.655296,0.795549,0.660118,0.80303,0.674212,0.831892
3,Antelope Valley,0.648424,0.825977,0.683125,0.849835,0.672477,0.845734,0.694424,0.865466,0.679802,...,0.701185,0.866013,0.693808,0.87242,0.675724,0.878712,0.651409,0.877873,0.668482,0.897059
4,Bakersfield,0.623323,0.783282,0.652508,0.816114,0.665065,0.840461,0.63551,0.812449,0.60456,...,0.585225,0.830923,0.58897,0.814287,0.510219,0.77478,0.549902,0.788783,0.563361,0.800748


In [None]:
# reshape math from wide (one column per year + metric) to long with one row
# per (College, Year)
cols = [c for c in math.columns if c != "College"]

long = math.melt(
    id_vars="College",
    value_vars=cols,
    var_name="year_metric",
    value_name="value"
)

# split the "YYYY Metric" column into Year and Metric
long = long.assign(
    Year=long["year_metric"].str.extract(r"^(\d{4})")[0].astype(int),
    Metric=long["year_metric"].str.replace(r"^\d{4}\s+", "", regex=True).str.strip()
).drop(columns="year_metric")

# pivot so each row is College, Year, Success Rate, Retention Rate
math_yearly = (
    long.pivot_table(
        index=["Year", "College"],
        columns="Metric",
        values="value"
    ).reset_index()
)

math_yearly.columns.name = None
math_yearly = math_yearly.sort_values(["Year", "College"]).reset_index(drop=True)

math_yearly.head()

Unnamed: 0,Year,College,Retention Rate,Success Rate
0,2012,Alameda,0.673759,0.556738
1,2012,Allan Hancock,0.788564,0.614829
2,2012,American River,0.808152,0.679089
3,2012,Antelope Valley,0.825977,0.648424
4,2012,Bakersfield,0.783282,0.623323


In [None]:
math_yearly.columns = [
    "Year",
    "College",
    "Math Retention Rate",
    "Math Success Rate"
]

In [None]:
newpath = "ucd_sta_221_project/ml/q2_persistence_after_transfer/processed_data"
math_yearly.to_csv(f"{newpath}/{file}_normalized.csv", index=False)