In [1]:
import warnings

import numpy as np
import pandas as pd

## graphing
from plotnine import *

from suso import abtesting
from suso.plotting_themes import standard_background
from suso.utils import here

warnings.filterwarnings("ignore")

In [2]:
DATA_DIR = here("data")
OUTPUT_DIR = here("output")

In [3]:
def create_plot_ofcertainty(
    probs_alldraws,
    single_draw_value,
    scale_y,
    direction,
    outcome,
    adjust_x=-0.05,
    treatment_color="#2B4888",
    control_color="#444444",
):
    prob_df = pd.DataFrame({"certainty": probs_alldraws})
    plot = (
        ggplot(prob_df, aes(x="certainty"))
        + geom_density(fill="white", color=treatment_color)
        + standard_background
        + ylab("Density of draws")
        + scale_fill_gradient(low="white", high="gray")
        + scale_x_continuous(breaks=np.arange(0, 1, 0.1))
        + annotate(
            "text",
            x=single_draw_value + adjust_x,
            y=(scale_y / 4) * 3.8,
            label="Observed degree\nof certainty:\n{}".format(
                round(single_draw_value, 3)
            ),
        )
        + xlab(
            "Degree of certainty that treatment students\nhad {direction} {outcome} than control students".format(
                direction=direction, outcome=outcome
            )
        )
    )
    return plot

# 1. Load data

In [4]:
attendance_analytic = pd.read_pickle(DATA_DIR / "attendance_readyforAB.pkl")
attendance_analytic.head()

treat_data = attendance_analytic[attendance_analytic.is_treatment == True].copy()
control_data = attendance_analytic[attendance_analytic.is_treatment == False].copy()

# 2. A/B testing of binary year-end attendance outcomes

## 2.1 Truancy

### 2.1.1 One draw version

In [5]:
outcome_varname = "truant_indicator"

In [6]:
successes_treatment = treat_data.usi[treat_data[outcome_varname] == 1].nunique()
failures_treatment = treat_data.usi[treat_data[outcome_varname] == 0].nunique()
successes_control = control_data.usi[control_data[outcome_varname] == 1].nunique()
failures_control = control_data.usi[control_data[outcome_varname] == 0].nunique()

truancy_prob_txgreaterthancontrol = abtesting.degree_of_certainty(
    successes_control, failures_control, successes_treatment, failures_treatment
)
truancy_prob_txlessthancontrol = 1 - truancy_prob_txgreaterthancontrol

outcome = "truant"
direction = "LOWER"
print(f"Count {outcome} in treatment group {successes_treatment}")
print(
    f"Percent {outcome} in treatment group {successes_treatment / (successes_treatment + failures_treatment) * 100}"
)
print(f"Count {outcome} in control group {successes_control}")
print(
    "Percent {} in control group ".format(outcome)
    + str(successes_control / (successes_control + failures_control) * 100)
)
print("--------------------------------")
print(
    f"A/B test single draw prob. that treatment group had {direction} rate than control group: {truancy_prob_txlessthancontrol}"
)

Count truant in treatment group 294
Percent truant in treatment group 48.595041322314046
Count truant in control group 302
Percent truant in control group 50.92748735244519
--------------------------------
A/B test single draw prob. that treatment group had LOWER rate than control group: 0.7900483866563637


In [7]:
num_treatment = successes_treatment + failures_treatment
num_control = successes_control + failures_control
base_rate = successes_control / num_control
treatment_rate = successes_treatment / num_treatment

(
    truancy_prob_txlessthancontrol_perc,
    truancy_prob_txlessthancontrol_alldraws,
    truancy_posterior,
) = abtesting.degree_of_certainty_draws_forwriteup(
    base_rate=base_rate,
    treatment_rate=treatment_rate,
    successes_control=successes_control,
    successes_treatment=successes_treatment,
    failures_control=failures_control,
    failures_treatment=failures_treatment,
    num_control=num_control,
    num_treatment=num_treatment,
    num_draws=10000,
    tx_greater=False,
)

In [8]:
certainty_truancy = create_plot_ofcertainty(
    probs_alldraws=truancy_prob_txlessthancontrol_alldraws,
    single_draw_value=truancy_prob_txlessthancontrol,
    scale_y=2,
    direction="lower",
    outcome="year-end truancy",
)
ggsave(
    certainty_truancy,
    OUTPUT_DIR / "certainty_truancy.pdf",
    width=12,
    height=8,
)

## 2.2 Chronic absenteeism

### 2.2.1 One draw version

In [9]:
outcome_varname = "chronicabsent_indicator"

In [10]:
successes_treatment = len(treat_data.usi[treat_data[outcome_varname] == 1].unique())
failures_treatment = len(treat_data.usi[treat_data[outcome_varname] == 0].unique())
successes_control = len(control_data.usi[control_data[outcome_varname] == 1].unique())
failures_control = len(control_data.usi[control_data[outcome_varname] == 0].unique())

ca_prob_txgreaterthancontrol = abtesting.degree_of_certainty(
    successes_control, failures_control, successes_treatment, failures_treatment
)

outcome = "chronically absent"
direction = "LOWER"
print("Count {} in treatment group ".format(outcome) + str(successes_treatment))
print(
    "Percent {} in treatment group ".format(outcome)
    + str(successes_treatment / (successes_treatment + failures_treatment) * 100)
)
print("Count {} in control group ".format(outcome) + str(successes_control))
print(
    "Percent {} in control group ".format(outcome)
    + str(successes_control / (successes_control + failures_control) * 100)
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} rate than control group: ".format(
        direction
    )
    + str(1 - ca_prob_txgreaterthancontrol)
)

Count chronically absent in treatment group 255
Percent chronically absent in treatment group 42.14876033057851
Count chronically absent in control group 240
Percent chronically absent in control group 40.472175379426645
--------------------------------
A/B test single draw prob. that treatment group had LOWER rate than control group: 0.2780830594767396


### 2.2.2 Many draw version

In [11]:
num_treatment = successes_treatment + failures_treatment
num_control = successes_control + failures_control
base_rate = successes_control / num_control
treatment_rate = successes_treatment / num_treatment

(
    ca_prob_txlessthancontrol_perc,
    ca_prob_txlessthancontrol_alldraws,
    ca_posterior,
) = abtesting.degree_of_certainty_draws_forwriteup(
    base_rate=base_rate,
    treatment_rate=treatment_rate,
    successes_control=successes_control,
    successes_treatment=successes_treatment,
    failures_control=failures_control,
    failures_treatment=failures_treatment,
    num_control=num_control,
    num_treatment=num_treatment,
    num_draws=10000,
    tx_greater=False,
)

In [12]:
certainty_ca = create_plot_ofcertainty(
    probs_alldraws=ca_prob_txlessthancontrol_alldraws,
    single_draw_value=(1 - ca_prob_txgreaterthancontrol),
    scale_y=2,
    direction="lower",
    outcome="year-end chronic absenteeism",
)

# 3. A/B testing: count outcomes 

Repeat same process for count outcomes

## 3.1 unexcused in two week period: median delivery date

### 3.1.1: single draws

In [13]:
##
outcome_varname = "diff_unexcused_mediandelivery"
exposure_varname = "diff_schooldays_mediandelivery"
events_treatment = np.sum(treat_data[outcome_varname])
exposure_treatment = np.sum(treat_data[exposure_varname])
events_control = np.sum(control_data[outcome_varname])
exposure_control = np.sum(control_data[exposure_varname])


## single draw test
unex_mediandelivery_prob_txgreaterthancontrol = abtesting.degree_of_certainty_counts(
    events_control=events_control,
    exposure_control=exposure_control,
    events_tx=events_treatment,
    exposure_tx=exposure_treatment,
)

unex_mediandelivery_prob_txlessthancontrol = (
    1 - unex_mediandelivery_prob_txgreaterthancontrol
)

outcome = "of unexcused absences in two weeks after letter (median delivery day)"
direction = "lower"
print("Count {} in treatment group ".format(outcome) + str(events_treatment))
print(
    "Rate {} in treatment group ".format(outcome)
    + str(events_treatment / (exposure_treatment))
)
print("Count {} in control group ".format(outcome) + str(events_control))
print(
    "Rate {} in control group ".format(outcome)
    + str(events_control / (exposure_control))
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} count than control group: ".format(
        direction
    )
    + str(unex_mediandelivery_prob_txlessthancontrol)
)

Count of unexcused absences in two weeks after letter (median delivery day) in treatment group 245.0
Rate of unexcused absences in two weeks after letter (median delivery day) in treatment group 0.04888268156424581
Count of unexcused absences in two weeks after letter (median delivery day) in control group 303.0
Rate of unexcused absences in two weeks after letter (median delivery day) in control group 0.06047904191616767
--------------------------------
A/B test single draw prob. that treatment group had lower count than control group: 0.9927751595539442


### 3.1.2 multiple draws

In [14]:
(
    unex_mediandelivery_perc,
    unex_mediandelivery_alldraws,
    unex_mediandelivery_posterior,
) = abtesting.degree_of_certainty_count_draws_forwriteup(
    control_events=events_control,
    treatment_events=events_treatment,
    control_exposure=exposure_control,
    treatment_exposure=exposure_treatment,
    num_draws=10000,
    tx_greater=False,
)

In [15]:
unex_mediandelivery_alldraws

array([0.999986  , 0.99838651, 0.99859219, ..., 0.9879524 , 0.99901596,
       0.98883458])

In [16]:
certainty_unex_itt = create_plot_ofcertainty(
    probs_alldraws=unex_mediandelivery_alldraws,
    single_draw_value=unex_mediandelivery_prob_txlessthancontrol,
    scale_y=40,
    direction="lower",
    outcome="count of unexcused absencesover two weeks\n(assume 7 day delivery timeline)\n",
    adjust_x=-0.15,
)

ggsave(
    certainty_unex_itt,
    OUTPUT_DIR / "certainty_unex_itt.pdf",
    width=12,
    height=8,
)

## 3.2 unexcused in two week period: observed delivery

In [17]:
outcome_varname = "diff_unexcused_observeddelivery"
exposure_varname = "diff_schooldays_observeddelivery"
events_treatment = np.sum(treat_data[outcome_varname])
exposure_treatment = np.sum(treat_data[exposure_varname])
events_control = np.sum(control_data[outcome_varname])
exposure_control = np.sum(control_data[exposure_varname])


## single draw test
unex_obsdelivery_prob_txgreaterthancontrol = abtesting.degree_of_certainty_counts(
    events_control=events_control,
    exposure_control=exposure_control,
    events_tx=events_treatment,
    exposure_tx=exposure_treatment,
)

unex_obsdelivery_prob_txlessthancontrol = 1 - unex_obsdelivery_prob_txgreaterthancontrol

outcome = "of unexcused absences in two weeks after letter (observed delivery day)"
direction = "lower"
print("Count {} in treatment group ".format(outcome) + str(events_treatment))
print("Exposure {} in treatment group ".format(outcome) + str(exposure_treatment))
print(
    "Rate {} in treatment group ".format(outcome)
    + str(events_treatment / (exposure_treatment))
)
print("Count {} in control group ".format(outcome) + str(events_control))
print("Exposure {} in control group ".format(outcome) + str(exposure_control))
print(
    "Rate {} in control group ".format(outcome)
    + str(events_control / (exposure_control))
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} count than control group: ".format(
        direction
    )
    + str(unex_obsdelivery_prob_txlessthancontrol)
)

Count of unexcused absences in two weeks after letter (observed delivery day) in treatment group 252.0
Exposure of unexcused absences in two weeks after letter (observed delivery day) in treatment group 5011.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in treatment group 0.05028936340051886
Count of unexcused absences in two weeks after letter (observed delivery day) in control group 318.0
Exposure of unexcused absences in two weeks after letter (observed delivery day) in control group 5093.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in control group 0.062438641272334575
--------------------------------
A/B test single draw prob. that treatment group had lower count than control group: 0.994404041775581


In [18]:
(
    unex_obsdelivery_perc,
    unex_obsdelivery_alldraws,
    unex_obsdelivery_posterior,
) = abtesting.degree_of_certainty_count_draws_forwriteup(
    control_events=events_control,
    treatment_events=events_treatment,
    control_exposure=exposure_control,
    treatment_exposure=exposure_treatment,
    num_draws=10000,
    tx_greater=False,
)

certainty_unex_observed = create_plot_ofcertainty(
    probs_alldraws=unex_obsdelivery_alldraws,
    single_draw_value=unex_obsdelivery_prob_txlessthancontrol,
    scale_y=20,
    direction="lower",
    outcome="count of unexcused absences over two weeks\n(using observed delivery date)\n",
    adjust_x=-0.15,
)

ggsave(
    certainty_unex_observed,
    OUTPUT_DIR / "certainty_unex_observed.pdf",
    width=12,
    height=8,
)

## 3.3 unexcused and excused in two-week period: median delivery date


In [19]:
outcome_varname = "diff_excusedorunexcused_mediandelivery"
exposure_varname = "diff_schooldays_mediandelivery"
events_treatment = np.sum(treat_data[outcome_varname])
exposure_treatment = np.sum(treat_data[exposure_varname])
events_control = np.sum(control_data[outcome_varname])
exposure_control = np.sum(control_data[exposure_varname])


## single draw test
unexandex_mediandelivery_prob_txgreaterthancontrol = (
    abtesting.degree_of_certainty_counts(
        events_control=events_control,
        exposure_control=exposure_control,
        events_tx=events_treatment,
        exposure_tx=exposure_treatment,
    )
)

unexandex_mediandelivery_prob_txlessthancontrol = (
    1 - unexandex_mediandelivery_prob_txgreaterthancontrol
)

outcome = "of unexcused absences in two weeks after letter (observed delivery day)"
direction = "lower"
print("Count {} in treatment group ".format(outcome) + str(events_treatment))
print(
    "Rate {} in treatment group ".format(outcome)
    + str(events_treatment / (exposure_treatment))
)
print("Count {} in control group ".format(outcome) + str(events_control))
print(
    "Rate {} in control group ".format(outcome)
    + str(events_control / (exposure_control))
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} count than control group: ".format(
        direction
    )
    + str(unexandex_mediandelivery_prob_txlessthancontrol)
)

Count of unexcused absences in two weeks after letter (observed delivery day) in treatment group 503.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in treatment group 0.10035913806863528
Count of unexcused absences in two weeks after letter (observed delivery day) in control group 520.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in control group 0.10379241516966067
--------------------------------
A/B test single draw prob. that treatment group had lower count than control group: 0.693951139330429


In [20]:
(
    unexandex_mediandelivery_perc,
    unexandex_mediandelivery_alldraws,
    unexandex_mediandelivery_posterior,
) = abtesting.degree_of_certainty_count_draws_forwriteup(
    control_events=events_control,
    treatment_events=events_treatment,
    control_exposure=exposure_control,
    treatment_exposure=exposure_treatment,
    num_draws=10000,
    tx_greater=False,
)

In [21]:
certainty_both_itt = create_plot_ofcertainty(
    probs_alldraws=unexandex_mediandelivery_alldraws,
    single_draw_value=unexandex_mediandelivery_prob_txlessthancontrol,
    scale_y=1.5,
    direction="lower",
    outcome="count of unexcused OR excused absences over two weeks\n(assume 7 day delivery timeline)\n",
    adjust_x=-0.15,
)

ggsave(
    certainty_both_itt,
    OUTPUT_DIR / "certainty_both_itt.pdf",
    width=12,
    height=8,
)

## 3.4 unexcused and excused in two-week period: observed delivery

In [22]:
outcome_varname = "diff_excusedorunexcused_observeddelivery"
exposure_varname = "diff_schooldays_observeddelivery"
events_treatment = np.sum(treat_data[outcome_varname])
exposure_treatment = np.sum(treat_data[exposure_varname])
events_control = np.sum(control_data[outcome_varname])
exposure_control = np.sum(control_data[exposure_varname])


## single draw test
unexandex_obsdelivery_prob_txgreaterthancontrol = abtesting.degree_of_certainty_counts(
    events_control=events_control,
    exposure_control=exposure_control,
    events_tx=events_treatment,
    exposure_tx=exposure_treatment,
)

unexandex_obsdelivery_prob_txlessthancontrol = (
    1 - unexandex_obsdelivery_prob_txgreaterthancontrol
)

outcome = "of unexcused absences in two weeks after letter (observed delivery day)"
direction = "lower"
print("Count {} in treatment group ".format(outcome) + str(events_treatment))
print("Exposure {} in treatment group ".format(outcome) + str(exposure_treatment))
print(
    "Rate {} in treatment group ".format(outcome)
    + str(events_treatment / (exposure_treatment))
)
print("Count {} in control group ".format(outcome) + str(events_control))
print("Exposure {} in control group ".format(outcome) + str(exposure_control))
print(
    "Rate {} in control group ".format(outcome)
    + str(events_control / (exposure_control))
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} count than control group: ".format(
        direction
    )
    + str(unexandex_obsdelivery_prob_txlessthancontrol)
)

Count of unexcused absences in two weeks after letter (observed delivery day) in treatment group 522.0
Exposure of unexcused absences in two weeks after letter (observed delivery day) in treatment group 5011.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in treatment group 0.10417082418678907
Count of unexcused absences in two weeks after letter (observed delivery day) in control group 524.0
Exposure of unexcused absences in two weeks after letter (observed delivery day) in control group 5093.0
Rate of unexcused absences in two weeks after letter (observed delivery day) in control group 0.1028863145493815
--------------------------------
A/B test single draw prob. that treatment group had lower count than control group: 0.4085241066284664


In [23]:
(
    unexandex_obsdelivery_perc,
    unexandex_obsdelivery_alldraws,
    unexandex_obsdelivery_posterior,
) = abtesting.degree_of_certainty_count_draws_forwriteup(
    control_events=events_control,
    treatment_events=events_treatment,
    control_exposure=exposure_control,
    treatment_exposure=exposure_treatment,
    num_draws=10000,
    tx_greater=False,
)

In [24]:
certainty_both_obs = create_plot_ofcertainty(
    probs_alldraws=unexandex_mediandelivery_alldraws,
    single_draw_value=unexandex_mediandelivery_prob_txlessthancontrol,
    scale_y=1.5,
    direction="lower",
    outcome="count of unexcused OR excused absences over two weeks\n(using observed delivery date)\n",
    adjust_x=-0.15,
)

ggsave(
    certainty_both_obs,
    OUTPUT_DIR / "certainty_both_obs.pdf",
    width=12,
    height=8,
)

# 4. Engagement in 14 days (rerunning here to standardize)

In [25]:
%%capture

engage_df = pd.read_csv(DATA_DIR / "engage_forAB.csv")
engage_df["engaged_binary"] = np.where(engage_df.engaged_14days == False, 0, 1)
treat_data = engage_df.loc[engage_df.is_treatment == True].copy()
control_data = engage_df.loc[engage_df.is_treatment == False].copy()
outcome_varname = "engaged_binary"
engage_df.head()

In [26]:
successes_treatment = len(
    treat_data.student_id[treat_data[outcome_varname] == 1].unique()
)
failures_treatment = len(
    treat_data.student_id[treat_data[outcome_varname] == 0].unique()
)
successes_control = len(
    control_data.student_id[control_data[outcome_varname] == 1].unique()
)
failures_control = len(
    control_data.student_id[control_data[outcome_varname] == 0].unique()
)

engaged_prob_txgreaterthancontrol = abtesting.degree_of_certainty(
    successes_control, failures_control, successes_treatment, failures_treatment
)
engaged_prob_txlessthancontrol = 1 - truancy_prob_txgreaterthancontrol

outcome = "engaged"
direction = "LOWER"
print("Count {} in treatment group ".format(outcome) + str(successes_treatment))
print(
    "Percent {} in treatment group ".format(outcome)
    + str(successes_treatment / (successes_treatment + failures_treatment) * 100)
)
print("Count {} in control group ".format(outcome) + str(successes_control))
print(
    "Percent {} in control group ".format(outcome)
    + str(successes_control / (successes_control + failures_control) * 100)
)
print("--------------------------------")
print(
    "A/B test single draw prob. that treatment group had {} rate than control group: ".format(
        direction
    )
    + str(engaged_prob_txgreaterthancontrol)
)

Count engaged in treatment group 38
Percent engaged in treatment group 5.855161787365177
Count engaged in control group 48
Percent engaged in control group 7.68
--------------------------------
A/B test single draw prob. that treatment group had LOWER rate than control group: 0.09801971018619943


In [27]:
## many draw version
num_treatment = successes_treatment + failures_treatment
num_control = successes_control + failures_control
base_rate = successes_control / num_control
treatment_rate = successes_treatment / num_treatment

(
    engaged_prob_txlessthancontrol_perc,
    engaged_prob_txlessthancontrol_alldraws,
    engaged_posterior,
) = abtesting.degree_of_certainty_draws_forwriteup(
    base_rate=base_rate,
    treatment_rate=treatment_rate,
    successes_control=successes_control,
    successes_treatment=successes_treatment,
    failures_control=failures_control,
    failures_treatment=failures_treatment,
    num_control=num_control,
    num_treatment=num_treatment,
    num_draws=10000,
    tx_greater=False,
)

certainty_engaged = create_plot_ofcertainty(
    probs_alldraws=engaged_prob_txlessthancontrol_alldraws,
    single_draw_value=engaged_prob_txlessthancontrol,
    scale_y=1.5,
    direction="lower",
    outcome="engagement rate",
    adjust_x=-0.15,
)

certainty_engaged

ggsave(
    certainty_engaged,
    OUTPUT_DIR / "certainty_engaged.pdf",
    width=12,
    height=8,
)

# 4. Export posteriors to plot in R

In [28]:
## create df with all posteriors
posterior_arrays = [
    truancy_posterior,
    ca_posterior,
    unex_obsdelivery_posterior,
    unex_mediandelivery_posterior,
    unexandex_obsdelivery_posterior,
    unexandex_mediandelivery_posterior,
    engaged_posterior,
]

In [29]:
posterior_array_df_list = [pd.DataFrame(x) for x in posterior_arrays]
posterior_array_df = pd.concat(posterior_array_df_list, axis=1)
posterior_array_df.columns = [
    "truancy",
    "ca",
    "unex_obs",
    "unex_median",
    "unexandex_obs",
    "unexandex_median",
    "engagement",
]
posterior_array_df.head()

Unnamed: 0,truancy,ca,unex_obs,unex_median,unexandex_obs,unexandex_median,engagement
0,-0.195448,-0.134666,-0.435855,-0.413925,-0.318055,-0.431035,-0.109575
1,-0.181748,-0.133224,-0.423348,-0.413063,-0.302915,-0.383864,-0.097568
2,-0.180047,-0.125408,-0.401599,-0.394315,-0.299215,-0.365054,-0.09543
3,-0.177654,-0.125007,-0.398299,-0.392198,-0.261834,-0.353687,-0.089629
4,-0.168538,-0.122768,-0.398067,-0.391808,-0.257685,-0.342976,-0.088669


In [30]:
posterior_array_df.to_csv(
    DATA_DIR / "attendanceandengageoutcomes_posteriors_toplot.csv", index=False
)