## Convert CSVs to Parquet

To make the files easier to work with locally, convert all the CSV files we received from OSSE to Parquet format.

In [1]:
from pathlib import Path

import pandas as pd

from suso.utils import here

In [2]:
# Directory that the cells below read the OSSE CSVs from and write the
# converted parquet files to.
# NOTE(review): `here` comes from suso.utils; presumably it resolves the path
# relative to the project root -- confirm.
DATA_DIR = here("data")

In [3]:
def read_attendance_data(path: Path) -> pd.DataFrame:
    """Read one attendance CSV and normalize it into a common layout.

    The file is read as latin1, its columns are renamed to a shared set of
    names, the ``SISID`` column is dropped when present, YES/NO indicator
    columns become booleans, rows for grades 09-12 are removed, the date
    columns are parsed, and ``StudentLocalID`` is cast to ``str``.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts,
            e.g. a ``.csv.gz`` file).

    Returns:
        The cleaned DataFrame. The original row index is kept, so it may
        have gaps where high-school rows were removed.

    Raises:
        KeyError: If, after renaming, the file lacks ``GradeLevel``,
            ``AttendanceDate``, ``enroll_date``, ``fixed_withdrawal_date``
            or ``StudentLocalID``.
    """
    df = pd.read_csv(path, encoding="latin1")

    # Standardize column names; keys that are absent from a file are ignored.
    df = df.rename(
        columns={
            "homeless_indicator": "HomelessIndicatorOSSE",
            "at_risk_indicator": "AtRiskIndicator",
            "overage_indicator": "OverageIndicator",
            "Withdrawal_date": "fixed_withdrawal_date",
            "farms": "FarmsStatusSISdesc",
            "local_id": "StudentLocalID",
            "grade": "GradeLevel",
            "race_ethnicity": "race",
            "schoolcode": "Enr_SchoolID",
            "schoolname": "Enr_SchoolName",
            "Attendance StatusCode Description": "Attendance_Status_Desc",
            "school_cep_status": "School_CEP_Status",
            "economically_disadvantaged": "Economically_Disadvantaged",
            "english_learner": "LEPIndicator",
        }
    )

    # Drop unused columns (not every file has them).
    df = df.drop(columns=["SISID"], errors="ignore")

    # Convert booleans
    for col in [
        "HomelessIndicatorOSSE",
        "AtRiskIndicator",
        "OverageIndicator",
        "LEPIndicator",
        "Economically_Disadvantaged",
        "School_CEP_Status",
    ]:
        if col in df.columns:
            # Some columns only available in one year or the other.
            # Missing values count as "NO"; only a (case-insensitive) "YES"
            # maps to True.
            df[col] = df[col].fillna("NO").str.strip().str.upper() == "YES"

    # Subset to relevant grades: drop high-school rows. The comparison is
    # against the zero-padded string codes. `.copy()` makes the result an
    # independent frame so the column assignments below do not write into a
    # slice of the unfiltered data (avoids SettingWithCopyWarning).
    high_school_grades = ["09", "10", "11", "12"]
    df = df[~df["GradeLevel"].isin(high_school_grades)].copy()

    # Convert dates
    for date_col in ["AttendanceDate", "enroll_date", "fixed_withdrawal_date"]:
        df[date_col] = pd.to_datetime(df[date_col])

    # StudentLocalID should be treated as a string due to Charter School ids
    df["StudentLocalID"] = df["StudentLocalID"].astype(str)

    return df

In [4]:
# SY17-18 DCPS attendance: read the cleaned CSV and store it as parquet.
source_csv = DATA_DIR / "SY1718_DCPS_Attendance_Data_cleaned.csv.gz"
target_parquet = DATA_DIR / "SY1718_DCPS_Attendance_Data_cleaned.parquet"
df = read_attendance_data(source_csv)
df.to_parquet(target_parquet)

In [5]:
# SY17-18 charter-sector attendance: read the cleaned CSV and store it as parquet.
source_csv = DATA_DIR / "SY1718_Charter_Sector_Attendance_cleaned.csv.gz"
target_parquet = DATA_DIR / "SY1718_Charter_Sector_Attendance_cleaned.parquet"
df = read_attendance_data(source_csv)
df.to_parquet(target_parquet)

In [6]:
# SY16-17 DCPS attendance (redacted file): read the CSV and store it as parquet.
source_csv = DATA_DIR / "redacted_SY1617_DCPS_Sector_Attendance.csv.gz"
target_parquet = DATA_DIR / "redacted_SY1617_DCPS_Sector_Attendance.parquet"
df = read_attendance_data(source_csv)
df.to_parquet(target_parquet)

  df = read_attendance_data(DATA_DIR / "redacted_SY1617_DCPS_Sector_Attendance.csv.gz")


In [7]:
# SY16-17 charter-sector attendance (redacted file): read the CSV and store it as parquet.
source_csv = DATA_DIR / "redacted_SY1617_Charter_Sector_Attendance.csv.gz"
target_parquet = DATA_DIR / "redacted_SY1617_Charter_Sector_Attendance.parquet"
df = read_attendance_data(source_csv)
df.to_parquet(target_parquet)

  df = read_attendance_data(DATA_DIR / "redacted_SY1617_Charter_Sector_Attendance.csv.gz")
