# Calculate descriptives regarding who's in the sample/balance

In [None]:
import warnings

import numpy as np
import pandas as pd
from plotnine import *

from suso.utils import PICKLE_PROTOCOL, here

# NOTE(review): this installs an "ignore" filter for every warning category,
# so deprecation notices from the imported libraries are hidden too —
# consider narrowing it to the specific warnings that are known to be noisy.
warnings.filterwarnings("ignore")

In [None]:
# Project folders resolved through the `here` helper: data files are read from
# and written to DATA_DIR, and figures are saved to OUTPUT_DIR.
DATA_DIR, OUTPUT_DIR = here("data"), here("output")

# Step 0: define postgres functions

For now, the postgres functions are simply copied from another script; they should eventually be moved into a module.

In [None]:
def data_startofyear_outcomes(data: pd.DataFrame) -> pd.DataFrame:
    """
    Return the earliest record for each student.

    Rows are ordered by ``usi`` and then ``AttendanceDate``, and only the
    first row per ``usi`` is kept. The input frame is left unmodified.
    """
    chronological = data.sort_values(by=["usi", "AttendanceDate"])
    return chronological.drop_duplicates(subset=["usi"], keep="first")


def data_endofyear_outcomes(data: pd.DataFrame) -> pd.DataFrame:
    """
    Return the latest record for each student.

    Rows are ordered by ``usi`` and then ``AttendanceDate``, and only the
    last row per ``usi`` is kept. The input frame is left unmodified.
    """
    chronological = data.sort_values(by=["usi", "AttendanceDate"])
    return chronological.drop_duplicates(subset=["usi"], keep="last")


## plotting themes
def _background_theme(x_tick_text, text_size):
    """
    Build the blank-background theme shared by the figures in this notebook.

    `x_tick_text` is the themeable used for the x-axis tick labels, and
    `text_size` is the size applied to the y-axis tick labels, the axis
    titles and the facet strip text. Legend text sizes are fixed.
    """
    return theme(
        panel_background=element_blank(),
        panel_grid_major_y=element_blank(),
        axis_text_x=x_tick_text,
        axis_text_y=element_text(color="black", size=text_size),
        legend_text=element_text(color="black", size=10),
        legend_title=element_text(color="black", size=12),
        axis_title=element_text(size=text_size),
        strip_text_x=element_text(size=text_size),
        legend_background=element_blank(),
        legend_key=element_blank(),
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
        axis_ticks=element_blank(),
    )


# Single-panel figures: black x tick labels rotated 90 degrees, size-12 text.
standard_background_rotatex = _background_theme(
    x_tick_text=element_text(color="black", angle=90, hjust=1, size=12),
    text_size=12,
)

# Faceted figures: x tick labels hidden, size-6 text.
facet_background = _background_theme(x_tick_text=element_blank(), text_size=6)

# 1: Descriptives 

Run the helper script to generate student attributes: 6helper_summarize_studentattributes.sql

## 1.1: Who's in the sample?



In [None]:
## read in student attributes and merge
# Columns pulled from each sector's cleaned attendance file.
_STUDENT_ATTRIBUTE_COLUMNS = [
    "usi",
    "AttendanceDate",
    "gender",
    "race",
    "Enr_SchoolID",
    "Enr_SchoolName",
    "SEDSSPEDLEvel",
    "enroll_date",
    "fixed_withdrawal_date",
    "LEPIndicator",
    "FarmsStatusSISdesc",
    "School_CEP_Status",
    "Economically_Disadvantaged",
    "HomelessIndicatorOSSE",
    "OverageIndicator",
    "AtRiskIndicator",
]


def _load_startofyear_attributes(filename):
    """Read one attendance file and keep each student's first record."""
    raw_records = pd.read_parquet(
        DATA_DIR / filename, columns=list(_STUDENT_ATTRIBUTE_COLUMNS)
    )
    first_records = data_startofyear_outcomes(raw_records)
    # The date is only needed to pick the first record, so drop it afterwards.
    return first_records.drop(columns=["AttendanceDate"])


student_attributes_public = _load_startofyear_attributes(
    "SY1718_DCPS_Attendance_Data_cleaned.parquet"
)

student_attributes_charter = _load_startofyear_attributes(
    "SY1718_Charter_Sector_Attendance_cleaned.parquet"
)

In [None]:
# Stack the public and charter attribute tables and drop exact duplicate rows.
# ignore_index=True gives the combined frame fresh row labels: each input keeps
# the row labels of its own source file, so without it the two sets of labels
# can collide and leave a non-unique index on the combined frame.
# NOTE(review): a usi that appears in both files with differing attribute
# values still yields two rows here — confirm that is intended.
student_attributes_all = pd.concat(
    [student_attributes_public, student_attributes_charter],
    ignore_index=True,
).drop_duplicates()

### 1.1.1 Create an indicator for whether each individual student is in the sample

A student is defined as "in sample" if they were referred, even if they have no valid treatment status.

In [None]:
## read in lookup table
# NOTE(review): read_pickle should only be pointed at trusted,
# project-generated files.
lookup_suso_attendance = pd.read_pickle(DATA_DIR / "suso_osse_lookup.pkl")
df_suso = pd.read_csv(DATA_DIR / "df_suso_merged.csv")

## suso ids that appear in the lookup table, split by whether the
## student has a non-null treatment status
linked_suso_ids = set(lookup_suso_attendance.suso_id)
has_treatment_status = df_suso.is_treatment.notnull()

susoids_ref_and_treat = list(
    linked_suso_ids & set(df_suso.student_id[has_treatment_status])
)
susoids_ref_or_treat = list(linked_suso_ids & set(df_suso.student_id))

## translate the suso ids into the usi's used by the attendance data
usis_ref_and_treat = lookup_suso_attendance.loc[
    lookup_suso_attendance.suso_id.isin(susoids_ref_and_treat), "usi"
]
usis_ref_or_treat = lookup_suso_attendance.loc[
    lookup_suso_attendance.suso_id.isin(susoids_ref_or_treat), "usi"
]

In [None]:
## create flags for in suso or not
# 1 when the student's usi is among the referred students, 0 otherwise.
is_referred = student_attributes_all["usi"].isin(usis_ref_or_treat)
student_attributes_all["in_sample"] = np.where(is_referred, 1, 0)

### 1.1.2 Create indicator for which students are in schools that are present in the sample 
### (conservative measure of which schools are eligible for SUSO)

In [None]:
# Enrolled-school names of every sampled student (one entry per student row).
enrolls_sampled_student = student_attributes_all.usi.isin(usis_ref_or_treat)
schools_wsuso_students = student_attributes_all.Enr_SchoolName[
    enrolls_sampled_student
].tolist()

# Report how many distinct schools have sampled students versus all schools.
n_sample_schools = len(set(schools_wsuso_students))
n_all_schools = len(student_attributes_all.Enr_SchoolName.unique())
print(f"{n_sample_schools} schools, out of ")
print(f"{n_all_schools} schools total have students in the sample")

In [None]:
# 1 when the student's enrolled school has at least one sampled student.
attends_sample_school = student_attributes_all["Enr_SchoolName"].isin(
    schools_wsuso_students
)
student_attributes_all["in_sample_school"] = np.where(attends_sample_school, 1, 0)

In [None]:
# Three-way label combining the student-level and school-level flags; any row
# matching neither condition falls through to the last label.
in_eligible_school = student_attributes_all["in_sample_school"] == 1
sampled = student_attributes_all["in_sample"] == 1
not_sampled = student_attributes_all["in_sample"] == 0

student_attributes_all["sample_and_schoolstatus"] = np.select(
    [
        (sampled & in_eligible_school).to_numpy(),
        (not_sampled & in_eligible_school).to_numpy(),
    ],
    [
        "In sample and\nin SUSO-eligible school",
        "Not in sample but\nin SUSO-eligible school",
    ],
    default="Not in sample and not\nin SUSO-eligible school",
)

### 1.1.3: Clean student attributes

In [None]:
## code races into broader cats using dictionary
race_remap_dictionary = {
    "B": "Black",
    "BL": "Black",
    "HI": "Hispanic",
    "H": "Hispanic",
    "WH": "White",
    "W": "White",
}
broad_race_labels = ["White", "Black", "Hispanic"]

# Strip surrounding whitespace before looking the codes up in the dictionary.
student_attributes_all["race_nows"] = student_attributes_all["race"].str.strip()
student_attributes_all["race_broadercat_init"] = student_attributes_all[
    "race_nows"
].replace(race_remap_dictionary)

# Anything that did not map onto one of the three labels becomes "Other".
student_attributes_all["race_broadercat"] = np.where(
    student_attributes_all["race_broadercat_init"].isin(broad_race_labels),
    student_attributes_all["race_broadercat_init"],
    "Other",
)

In [None]:
## attributes whose group means are compared
## between suso and non-suso students
indicator_attributes = [
    "LEPIndicator",
    "FarmsStatusSISdesc",
    "School_CEP_Status",
    "Economically_Disadvantaged",
    "HomelessIndicatorOSSE",
    "OverageIndicator",
    "AtRiskIndicator",
]
student_attributes_toexamine = indicator_attributes + ["gender", "race_broadercat"]

# One dummy column per level of each categorical attribute.
student_attributes_categorical = pd.get_dummies(
    student_attributes_all.loc[:, student_attributes_toexamine]
)
student_attributes_categorical_names = list(student_attributes_categorical.columns)

In [None]:
## get_dummies leaves a column untouched (same name, no suffix) when it
## is not expanded into levels, so give those columns an explicit "_True" suffix
student_attributes_categorical.columns = [
    f"{name}_True" if name in student_attributes_toexamine else name
    for name in student_attributes_categorical_names
]

## merge back dummies with original
student_attributes_withcat = pd.concat(
    [student_attributes_categorical, student_attributes_all], axis="columns"
)

## within-suso comparison
# NOTE(review): student_attributes_categorical_names was captured before the
# rename above, so any column that received the "_True" suffix is selected
# here under its original name (the copy from student_attributes_all) —
# confirm this is intended.
for_summary_morecat = [*student_attributes_categorical_names, "sample_and_schoolstatus"]

# 2: Find mean of student attributes for students in suso schools but not sample, in suso schools + in sample, or not in suso schools

## 2.1 Calculate

In [None]:
# Mean of every attribute column within each sample/school-status group,
# transposed so that each row is one attribute and each column one group.
group_means = (
    student_attributes_withcat[for_summary_morecat]
    .groupby("sample_and_schoolstatus")
    .mean()
)
mean_whetherinsuso_morecat = group_means.transpose().reset_index()

In [None]:
# Same column labels, except that the "index" column produced by
# reset_index() is called "attribute_name".
newcolnames = [
    {"index": "attribute_name"}.get(label, label)
    for label in mean_whetherinsuso_morecat.columns
]

In [None]:
# Apply the relabelled column names in place.
mean_whetherinsuso_morecat.columns = newcolnames

In [None]:
# Drop the rows for "Missing" levels, then reshape to long format: one row
# per (attribute, group) pair with the group label in `variable` and the
# group mean in `value`.
is_missing_level = mean_whetherinsuso_morecat["attribute_name"].str.contains(
    "Missing"
)
mean_whetherinsuso_clean_morecat = mean_whetherinsuso_morecat.loc[
    ~is_missing_level
].copy()
mean_whetherinsuso_clean_long_morecat = mean_whetherinsuso_clean_morecat.melt(
    id_vars="attribute_name"
)

In [None]:
## clean up attribute name
# Every replacement is a plain substring substitution, so regex=False is
# passed explicitly. The default of Series.str.replace changed from regex to
# literal matching in pandas 2.0; under that default the previous
# backslash-escaped patterns ("race\\_broadercat\\_", "gender\\_") no longer
# matched and the prefixes were left in place. Literal patterns with
# regex=False give the same result on both old and new pandas.
mean_whetherinsuso_clean_long_morecat["attribute_name_clean"] = (
    mean_whetherinsuso_clean_long_morecat.attribute_name.str.replace(
        "OSSE", "", regex=False
    )
    .str.replace("race_broadercat_", "", regex=False)
    .str.replace("gender_", "", regex=False)
    .str.replace("True", "Yes", regex=False)
    .str.replace("False", "No", regex=False)
)

In [None]:
# Demographic attributes shown in the first figure. These labels are matched
# against `attribute_name_clean`, in which "True"/"False" have already been
# rewritten to "Yes"/"No" — so the indicator entries must use the "_Yes"/"_No"
# spelling. The previous "_True"/"_False" entries could never match.
# NOTE(review): assumes the indicator dummies are named "<attribute>_True" /
# "<attribute>_False" before cleaning — confirm against the data.
dem_vars = [
    "F",
    "M",
    "Black",
    "White",
    "Hispanic",
    "Other",
    "OverageIndicator_Yes",
    "OverageIndicator_No",
    "LEPIndicator_No",
    "LEPIndicator_Yes",
]
# Every remaining cleaned attribute goes into the second figure.
other_vars = [
    var
    for var in mean_whetherinsuso_clean_long_morecat.attribute_name_clean.unique()
    if var not in dem_vars
]
# Display order: demographics first, then everything else.
all_vars = dem_vars + other_vars
whichgroups = mean_whetherinsuso_clean_long_morecat.variable.unique()
# Explicit order of the three comparison groups for the fill legend.
whichgroups_order = [
    "In sample and\nin SUSO-eligible school",
    "Not in sample but\nin SUSO-eligible school",
    "Not in sample and not\nin SUSO-eligible school",
]

In [None]:
## order levels of category
# Each ordered column is an ordered categorical built from an existing column
# with an explicit level order, so the plots follow that order.
for source_column, ordered_column, level_order in (
    ("attribute_name_clean", "attributes_ordered", all_vars),
    ("variable", "whichgroup_ordered", whichgroups_order),
):
    mean_whetherinsuso_clean_long_morecat[ordered_column] = pd.Categorical(
        mean_whetherinsuso_clean_long_morecat[source_column],
        categories=level_order,
        ordered=True,
    )

## 2.2: plot

In [None]:
def _attribute_bar_chart(attribute_names, legend_theme):
    """
    Dodged horizontal bar chart of group means for the given attributes.

    `attribute_names` selects the rows to plot (matched against
    `attributes_ordered`); `legend_theme` is the theme tweak that positions
    and styles the legend for this particular figure.
    """
    rows_to_plot = mean_whetherinsuso_clean_long_morecat[
        mean_whetherinsuso_clean_long_morecat.attributes_ordered.isin(attribute_names)
    ]
    # NOTE(review): `value` holds group means of the attribute columns, while
    # the axis label says "Percentage" — confirm the intended wording.
    return (
        ggplot(
            rows_to_plot,
            aes(x="factor(attributes_ordered)", y="value", fill="whichgroup_ordered"),
        )
        + geom_bar(stat="identity", position="dodge", alpha=0.8)
        + coord_flip()
        + ylab("Percentage of students\nin category")
        + xlab("Attribute")
        + standard_background_rotatex
        + scale_fill_manual(values=("firebrick", "springgreen", "gray"))
        + legend_theme
        + labs(fill="Category")
    )


# Demographic attributes.
att_studycat = _attribute_bar_chart(dem_vars, theme(legend_position=(0.75, 0.7)))

ggsave(att_studycat, OUTPUT_DIR / "attributes_studycat.pdf", width=12, height=8)

# All remaining attributes; smaller legend text.
moreatt_studycat = _attribute_bar_chart(
    other_vars,
    theme(legend_position=(0.75, 1), legend_text=element_text(size=8)),
)

ggsave(
    moreatt_studycat,
    OUTPUT_DIR / "moreattributes_studycat.pdf",
    width=12,
    height=8,
)

# 3: Check balance between the treatment and control group

### Balance by school

#### Engagement analytic sample

In [None]:
## full data
# Count students per school in each treatment arm, then reshape to long
# format (one row per school x arm) for faceted plotting.
balance_fullsample = pd.crosstab(
    df_suso.school_name, df_suso.is_treatment
).reset_index()
balance_fullsample_long = balance_fullsample.melt(id_vars=["school_name"])
# Element-wise comparison on a pandas column, so `== False` is intentional.
balance_fullsample_long["group"] = np.where(
    balance_fullsample_long.is_treatment == False,  # noqa: E712
    "Control",
    "Treatment",
)
# One small panel per school with a control bar and a treatment bar.
bal_byschool = (
    ggplot(
        balance_fullsample_long, aes(x="factor(group)", y="value", fill="factor(group)")
    )
    + geom_bar(stat="identity", position="dodge", alpha=0.5)
    + facet_background
    + scale_y_continuous(breaks=(0, 25, 50))
    + xlab("")
    + labs(fill="")
    + theme(legend_position="bottom")
    + scale_fill_manual(values=("#444444", "#2B4888"))
    + facet_wrap("~school_name")
)

# `device` is not a parameter of plotnine's save; unknown keywords are passed
# on to matplotlib's savefig. The format is taken from the ".pdf" suffix, as
# in every other ggsave call in this notebook.
ggsave(bal_byschool, OUTPUT_DIR / "bal_byschool.pdf", width=12, height=8)

#### Attendance analytic sample

In [None]:
attendance_both_clean = pd.read_parquet(DATA_DIR / "attendance_both_clean.parquet")

# The student attributes (anchored to the beginning of year) also carry an
# enrolled school. Since all other attributes are BOY, that one is kept and
# the end-of-year copy is dropped here.
attendance_endofyear_sy1718 = data_endofyear_outcomes(
    data=attendance_both_clean
).drop(columns=["Enr_SchoolName"])


## subset to the usi's for which we know treatment status
## (so not just referred), attach the student attributes,
## and keep one row per usi
has_known_treatment = attendance_endofyear_sy1718.usi.isin(usis_ref_and_treat)
attendance_endofyear_sy1718_withattributes = (
    attendance_endofyear_sy1718[has_known_treatment]
    .merge(student_attributes_all, on="usi", how="left")
    .drop_duplicates(subset=["usi"])
)


## attach the suso id through the lookup table
suso_lookup_columns = ["suso_id", "name_dob", "usi", "type_of_match"]
attendance_eoy_wsusoid = attendance_endofyear_sy1718_withattributes.merge(
    lookup_suso_attendance[suso_lookup_columns],
    on="usi",
    how="left",
)

## merge with treatment status and referral details, one row per suso id
suso_columns_tomerge = [
    "is_treatment",
    "status",
    "status_datetime",
    "student_id",
    "site_id",
    "site_name",
    "referral_source",
    "school_name",
]
df_suso_tomerge = df_suso[suso_columns_tomerge].copy()
attendance_eoy_wsuso = attendance_eoy_wsusoid.merge(
    df_suso_tomerge,
    left_on="suso_id",
    right_on="student_id",
    how="left",
).drop_duplicates(subset="suso_id", keep="first")

attendance_eoy_wsuso.to_pickle(
    DATA_DIR / "attendance_eoy_wsuso.pkl", protocol=PICKLE_PROTOCOL
)

In [None]:
## summarize attendance analytic sample
# Reload the frame written by the previous cell so this cell can be run on
# its own. NOTE(review): read_pickle should only be used on trusted,
# project-generated files.
attendance_eoy_wsuso = pd.read_pickle(DATA_DIR / "attendance_eoy_wsuso.pkl")

# Count students per school in each treatment arm, then reshape to long
# format (one row per school x arm) for faceted plotting.
balance_attendancesample = pd.crosstab(
    attendance_eoy_wsuso.school_name, attendance_eoy_wsuso.is_treatment
).reset_index()
balance_attendancesample_long = balance_attendancesample.melt(id_vars=["school_name"])
# Element-wise comparison on a pandas column, so `== False` is intentional.
balance_attendancesample_long["group"] = np.where(
    balance_attendancesample_long.is_treatment == False,  # noqa: E712
    "Control",
    "Treatment",
)
# (A bare `.head()` call used to sit here; it was not the last expression of
# the cell, so it displayed nothing and has been removed.)

# One small panel per school with a control bar and a treatment bar.
att_balbyschool = (
    ggplot(
        balance_attendancesample_long,
        aes(x="factor(group)", y="value", fill="factor(group)"),
    )
    + geom_bar(stat="identity", position="dodge", alpha=0.5)
    + facet_background
    + scale_y_continuous(breaks=(0, 25, 50))
    + xlab("")
    + labs(fill="")
    + theme(legend_position="bottom")
    + scale_fill_manual(values=("#444444", "#2B4888"))
    + facet_wrap("~school_name")
)

ggsave(att_balbyschool, OUTPUT_DIR / "attbal_byschool.pdf", width=12, height=8)

In [None]:
# Relabel the crosstab columns; this relies on the crosstab having exactly
# two treatment-status columns, with the control column first.
balance_attendancesample.columns = ["school_name", "Control", "Treatment"]

# Per-school totals and the share of students in each arm.
n_control = balance_attendancesample["Control"]
n_treatment = balance_attendancesample["Treatment"]
balance_attendancesample["total"] = n_control + n_treatment
balance_attendancesample["prop_treat"] = (
    n_treatment / balance_attendancesample["total"]
)
balance_attendancesample["prop_control"] = (
    n_control / balance_attendancesample["total"]
)
balance_attendancesample["prop_treat_minus_control"] = (
    balance_attendancesample["prop_treat"] - balance_attendancesample["prop_control"]
)


# One point per school; the dashed red line marks an even split.
balance_school_scatter = (
    ggplot(balance_attendancesample, aes(x="total", y="prop_treat_minus_control"))
    + geom_point()
    + standard_background_rotatex
    + xlab("Total students randomized from school")
    + ylab("Proportion in treatment group minus\nproportion in control group")
    + geom_hline(yintercept=0, linetype="dashed", color="red")
)

ggsave(
    balance_school_scatter,
    OUTPUT_DIR / "balance_school_scatter.pdf",
    width=12,
    height=8,
)

### Balance by demographic attributes

In [None]:
## merge in treatment status indicator
# Inner join: only students present in the attendance analytic sample remain.
for_merge = [*student_attributes_categorical_names, "usi"]
treatment_by_usi = attendance_eoy_wsuso[["usi", "is_treatment"]]
student_attributes_withcat_withtx = student_attributes_withcat[for_merge].merge(
    treatment_by_usi,
    on="usi",
    how="inner",
)

In [None]:
# Compare every column except identifiers (names containing "id" or "usi").
vars_compare = [
    var
    for var in student_attributes_withcat_withtx.columns
    if not any(token in var for token in ("id", "usi"))
]

# Attribute means by treatment status, one row per attribute. The relabelling
# relies on the control group's column coming first.
means_by_arm = (
    student_attributes_withcat_withtx[vars_compare].groupby("is_treatment").mean()
)
mean_txstatus = means_by_arm.transpose().reset_index()
mean_txstatus.columns = ["attribute", "Control", "Treatment"]
mean_txstatus["txprop_minus_controlprop"] = (
    mean_txstatus["Treatment"] - mean_txstatus["Control"]
)

## attach the cleaned, ordered attribute labels built for the earlier figures
attribute_labels = mean_whetherinsuso_clean_long_morecat[
    ["attribute_name", "attributes_ordered"]
].drop_duplicates()
mean_txstatus_cleaned = mean_txstatus.merge(
    attribute_labels,
    right_on="attribute_name",
    left_on="attribute",
    how="inner",
)


## plot by attribute
bal_dem = (
    ggplot(
        mean_txstatus_cleaned,
        aes(x="factor(attributes_ordered)", y="txprop_minus_controlprop"),
    )
    + geom_bar(stat="identity", fill="yellow", alpha=0.5, color="black")
    + xlab("Attribute")
    + ylab(
        "Treatment proportion\nminus control group proportion\n(positive = treatment group higher)"
    )
    + standard_background_rotatex
    + coord_flip()
)

ggsave(bal_dem, OUTPUT_DIR / "bal_dem.pdf", width=12, height=8)

# 4: Write files used in next script



In [None]:
# Persist the two frames under DATA_DIR with the project's pickle protocol.
for frame, pickle_name in (
    (attendance_eoy_wsuso, "attendance_eoy_wsuso.pkl"),
    (student_attributes_withcat_withtx, "student_attributes_withcat_withtx.pkl"),
):
    frame.to_pickle(DATA_DIR / pickle_name, protocol=PICKLE_PROTOCOL)