In [1]:
from pathlib import Path
from typing import List, Optional, Union

import pandas as pd

from suso.utils import here

In [2]:
# Base directory for every file this notebook reads or writes (paths below are
# built as `DATA_DIR / "<file name>"`). Resolved by the project's `here` helper;
# presumably relative to the project root -- TODO confirm against suso.utils.
DATA_DIR = here("data")

In [3]:
# Lookup table whose "usi" column is used by `attendance_outcomes` to keep only
# the attendance rows whose `usi` appears in it.
# NOTE: unpickling can execute arbitrary code -- only load this file if it comes
# from a trusted source.
lookup_df = pd.read_pickle(DATA_DIR / "suso_osse_lookup.pkl")

In [4]:
def attendance_outcomes(
    path: Union[str, Path, List[Union[str, Path]]],
    unexcused_names: List[str],
    excused_names: List[str],
    present_names: Optional[List[str]] = None,
    non_school_day_names: Optional[List[str]] = None,
    lookup: Optional[pd.DataFrame] = None,
    truancy_threshold: int = 10,
    chronic_absence_threshold: float = 0.1,
) -> pd.DataFrame:
    """Compute running truancy / chronic-absence outcomes from attendance files.

    Reads one or more parquet attendance files, flags each row by its
    ``Attendance_Status_Desc`` value, and accumulates those flags per ``usi``
    in ``AttendanceDate`` order.

    Parameters
    ----------
    path
        A single parquet path (``str`` or ``Path``) or a list of paths whose
        rows are stacked before processing.
    unexcused_names
        Status values counted as unexcused absences.
    excused_names
        Status values counted as excused absences.
    present_names
        Status values counted as "present". When given (and non-empty), a row
        is a school day iff its status is an unexcused, excused or present
        name. Takes precedence over ``non_school_day_names``.
    non_school_day_names
        Status values that are *not* school days. Used only when
        ``present_names`` is empty/None; every other status (including any
        unlisted one) then counts as a school day.
    lookup
        DataFrame with a ``usi`` column; only rows whose ``usi`` appears in it
        are returned. Defaults to the notebook-level ``lookup_df``.
    truancy_threshold
        Minimum cumulative unexcused absences for ``truant_indicator`` to be 1.
    chronic_absence_threshold
        Minimum cumulative absence ratio for ``chronicabsent_indicator`` to be 1.

    Returns
    -------
    pd.DataFrame
        The selected input columns plus ``is_unexcused``,
        ``is_excusedorunexcused``, ``is_schooldays``, their per-``usi`` running
        totals (``total_*``), ``truant_indicator``, ``chronicabsent_ratio`` and
        ``chronicabsent_indicator``.

    Raises
    ------
    ValueError
        If neither ``present_names`` nor ``non_school_day_names`` is provided,
        because school days cannot be identified.
    """
    # Fail fast, before the (potentially large) files are read: without one of
    # these two arguments the school-day flag cannot be computed.
    if not present_names and non_school_day_names is None:
        raise ValueError(
            "Provide either `present_names` or `non_school_day_names` so that "
            "school days can be identified."
        )

    columns = [
        "usi",
        "AttendanceDate",
        "StudentLocalID",
        "Attendance_Status_Desc",
        "Weekday_Name",
        "Enr_SchoolName",
    ]

    if isinstance(path, (str, Path)):
        df = pd.read_parquet(path, columns=columns)
    else:
        # ignore_index avoids duplicated index labels coming from the
        # individual files.
        df = pd.concat(
            [pd.read_parquet(pth, columns=columns) for pth in path],
            ignore_index=True,
        )

    status = df["Attendance_Status_Desc"]

    print("The attendance status codes:")
    print(status.value_counts(dropna=False))

    absence_names = list(unexcused_names) + list(excused_names)

    df["is_unexcused"] = status.isin(list(unexcused_names))
    df["is_excusedorunexcused"] = status.isin(absence_names)
    if present_names:
        df["is_schooldays"] = status.isin(absence_names + list(present_names))
    else:
        df["is_schooldays"] = ~status.isin(list(non_school_day_names))

    # Running totals are only meaningful in chronological order per usi.
    df = df.sort_values(by=["usi", "AttendanceDate"])

    flag_cols = ["is_unexcused", "is_excusedorunexcused", "is_schooldays"]
    # "is_unexcused" -> "total_unexcused", etc.
    cum_df = (
        df.groupby("usi")[flag_cols]
        .cumsum()
        .rename(columns={col: f"total_{col[3:]}" for col in flag_cols})
    )

    df = pd.concat([df, cum_df], axis=1)

    df["truant_indicator"] = (df["total_unexcused"] >= truancy_threshold).astype(int)
    # 0 / 0 (no school day seen yet for this usi) yields NaN, which compares
    # False below and therefore gives an indicator of 0.
    df["chronicabsent_ratio"] = df["total_excusedorunexcused"] / df["total_schooldays"]
    df["chronicabsent_indicator"] = (
        df["chronicabsent_ratio"] >= chronic_absence_threshold
    ).astype(int)

    # NOTE(review): de-duplication happens after the running totals were
    # computed, so exact duplicate input rows that carry a flag are counted
    # twice and are not removed here. Kept as-is to preserve existing results;
    # confirm whether duplicates should be dropped before the cumsum instead.
    df = df.drop_duplicates()

    if lookup is None:
        lookup = lookup_df

    df = df[df["usi"].isin(lookup["usi"])].copy()

    return df

In [5]:
# SY17-18 DCPS. School days are identified positively here: because
# `present_names` is supplied, a row counts as a school day only when its status
# is one of the unexcused, excused or present names below (so the
# "Non-School Day" rows are left out without being listed).
dcps_1718 = attendance_outcomes(
    path=DATA_DIR / "SY1718_DCPS_Attendance_Data_cleaned.parquet",
    present_names=["Present Full"],
    excused_names=["Absent Excused Suspension"],
    unexcused_names=["Absent Fully Unexcused"],
)

# Persist the per-row outcomes alongside the input data.
dcps_1718.to_parquet(DATA_DIR / "dcps_sy1718_attendanceoutcomes_suso.parquet")

The attendance status codes:
Present Full                 6581135
Non-School Day               4810100
Absent Fully Unexcused        372919
Absent Excused Suspension     231405
Name: Attendance_Status_Desc, dtype: int64




In [6]:
# SY17-18 charter sector. No `present_names` is passed, so every status other
# than "Non-School Day" counts as a school day, and only the four "Absent ..."
# statuses listed below count as absences.
# NOTE(review): that makes statuses such as "Not Submitted", "Unknown" and
# "Not Available" count as school days attended -- confirm this is intended.
charter_1718 = attendance_outcomes(
    path=DATA_DIR / "SY1718_Charter_Sector_Attendance_cleaned.parquet",
    non_school_day_names=["Non-School Day"],
    excused_names=["Absent Fully Excused", "Absent Partial Excused"],
    unexcused_names=["Absent Fully Unexcused", "Absent Partial Unexcused"],
)

# Persist the per-row outcomes alongside the input data.
charter_1718.to_parquet(DATA_DIR / "charter_sy1718_attendanceoutcomes_suso.parquet")

The attendance status codes:
Present Full                    5280224
Non-School Day                  4355220
Present Partial Unexcused        443820
Absent Fully Excused             324395
Absent Fully Unexcused           312470
Present Partial Excused           72559
Not Submitted                     12258
Absent Partial Unexcused          10009
Absent Partial Excused             8458
Present Partial                    1102
Absent - Adult Ed No Session         13
Unknown                              11
Not Available                         1
Name: Attendance_Status_Desc, dtype: int64


In [7]:
# SY16-17 DCPS. School days are every status except "Non-School Day"; both
# excused labels are listed so either spelling is counted as an excused absence.
dcps_1617 = attendance_outcomes(
    path=DATA_DIR / "redacted_SY1617_DCPS_Sector_Attendance.parquet",
    non_school_day_names=["Non-School Day"],
    excused_names=["Absent Excused Suspension", "Absent Fully Excused"],
    unexcused_names=["Absent Fully Unexcused"],
)

# Persist the per-row outcomes alongside the input data.
dcps_1617.to_parquet(DATA_DIR / "dcps_sy1617_attendanceoutcomes_suso.parquet")

The attendance status codes:
Present Full              6709148
Non-School Day            4774355
Absent Fully Unexcused     399373
Absent Fully Excused       244320
Name: Attendance_Status_Desc, dtype: int64


In [8]:
# SY16-17 charter sector. School days are every status except "Non-School Day";
# out-of-school suspensions are treated as excused absences.
# NOTE(review): "Absent Excused Suspension" is not listed in `excused_names`
# here (it is in the combined SY17-18 run), so such rows count as school days
# without an absence -- confirm this is intended.
charter_1617 = attendance_outcomes(
    path=DATA_DIR / "redacted_SY1617_Charter_Sector_Attendance.parquet",
    non_school_day_names=["Non-School Day"],
    excused_names=[
        "Absent Fully Excused",
        "Absent Partial Excused",
        "Absent - Out of School Suspension",
    ],
    unexcused_names=["Absent Fully Unexcused", "Absent Partial Unexcused"],
)

# Persist the per-row outcomes alongside the input data.
charter_1617.to_parquet(DATA_DIR / "charter_sy1617_attendanceoutcomes_suso.parquet")

The attendance status codes:
Present Full                         5177923
Non-School Day                       4098827
Present Partial Unexcused             409734
Absent Fully Excused                  282251
Absent Fully Unexcused                279474
Present Partial Excused                73261
Not Submitted                          15371
Absent Partial Unexcused               10332
Absent - Out of School Suspension       8354
Absent Partial Excused                  7899
Present - In School Suspension          1448
Absent - Adult Ed No Session             510
Absent Excused Suspension                  5
Not Available                              3
Name: Attendance_Status_Desc, dtype: int64


In [9]:
# SY17-18, DCPS and charter files stacked into a single frame before the
# outcomes are computed. The absence lists combine the labels used in the two
# per-file runs; school days are every status except "Non-School Day".
attendance_both_clean = attendance_outcomes(
    path=[
        DATA_DIR / "SY1718_DCPS_Attendance_Data_cleaned.parquet",
        DATA_DIR / "SY1718_Charter_Sector_Attendance_cleaned.parquet",
    ],
    non_school_day_names=["Non-School Day"],
    excused_names=[
        "Absent Excused Suspension",
        "Absent Partial Excused",
        "Absent Fully Excused",
    ],
    unexcused_names=["Absent Fully Unexcused", "Absent Partial Unexcused"],
)

# Persist the combined per-row outcomes alongside the input data.
attendance_both_clean.to_parquet(DATA_DIR / "attendance_both_clean.parquet")

The attendance status codes:
Present Full                    11861359
Non-School Day                   9165320
Absent Fully Unexcused            685389
Present Partial Unexcused         443820
Absent Fully Excused              324395
Absent Excused Suspension         231405
Present Partial Excused            72559
Not Submitted                      12258
Absent Partial Unexcused           10009
Absent Partial Excused              8458
Present Partial                     1102
Absent - Adult Ed No Session          13
Unknown                               11
Not Available                          1
Name: Attendance_Status_Desc, dtype: int64
