# Evaluating the SUSO outreach letters

This script pulls data from several sources and analyzes them using Kevin Wilson's Bayesian A/B testing script. The question it seeks to answer is: do the automated letters sent to a randomly assigned group of SUSO-eligible families increase the rate at which those families engage with SUSO CBOs?

For more information on the project, see: [github link]

In [1]:
import sys

sys.path.append("..")

import os
from datetime import datetime

import numpy as np
import pandas as pd

# load data

We're going to read in a bunch of data before merging it.  

In [2]:
def read_df(df_name, data_dir=os.path.join("..", "data")):
    """Load one of the project's CSV extracts into a DataFrame.

    Parameters
    ----------
    df_name : str
        File name without the ``.csv`` extension (e.g. ``"students"``).
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``../data`` relative
        to the working directory, the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Contents of ``<data_dir>/<df_name>.csv`` as parsed by
        ``pd.read_csv`` with default options (no date parsing is requested).
    """
    data_source = os.path.join(data_dir, df_name + ".csv")
    return pd.read_csv(data_source)


# Read every raw extract up front. The names on the left line up one-to-one
# with the file stems on the right, and the files are read in that order.
students, randomizer, jobs, mailings, eto_data = (
    read_df(stem)
    for stem in ("students", "randomizer", "jobs", "mailings", "eto_data")
)

# merge all the data about the experiment
First, merge the students and randomizer data.

In [3]:
# Only the student key and the guardian name columns are taken from `students`.
columns = ["id", "guardian_firstname", "guardian_lastname"]

# Left join so every student is kept even when the randomizer has no row for
# them. Afterwards the randomizer's own key and timestamp are discarded and
# the students' `id` becomes the frame's `student_id`.
df = (
    students[columns]
    .merge(randomizer, how="left", left_on="id", right_on="student_id")
    .drop(columns=["student_id", "created_at"])
    .rename(columns={"id": "student_id"})
)

# Eyeball check for duplicated students: the top counts should all be 1.
df["student_id"].value_counts().head()

22527    1
23090    1
23081    1
23082    1
18987    1
Name: student_id, dtype: int64

In [4]:
# Size of each experimental arm.
df["is_treatment"].value_counts()

True     683
False    653
Name: is_treatment, dtype: int64

Dedupe the jobs data and add it.

In [5]:
# Keep one job per student: the row with the latest `created_at`.
# sort_values() places missing values last by default, which would let a job
# with no timestamp win the keep="last" dedupe; na_position="first" makes a
# dated row win whenever one exists.
# NOTE(review): read_df() does not parse dates, so `created_at` is presumably
# compared as raw text -- confirm its format sorts chronologically.
jobs_deduped = jobs.sort_values(
    ["student_id", "created_at"], na_position="first"
).drop_duplicates("student_id", keep="last")

# Left join keeps students that have no job. The job's timestamp is dropped
# and its `id` is renamed so it can serve as the key for the mailings merge.
df = (
    df.merge(jobs_deduped, how="left", on="student_id")
    .drop(columns=["created_at"])
    .rename(columns={"id": "job_id"})
)

# `== True` is deliberate: it selects only rows explicitly flagged as treated,
# so rows whose is_treatment is missing are excluded rather than breaking the
# boolean mask.
treated_without_job = df.loc[df.is_treatment == True, "job_id"].isnull().sum()
print(
    "Number of treated students without an associated job_id: "
    + str(treated_without_job)
)
print("Number of rows:", len(df))

Number of treated students without an associated job_id: 19
Number of rows: 1492


Add Mailings data.  

In [6]:
# Keep one mailing per job: the row with the latest `status_datetime`.
# sort_values() places missing values last by default, which would let a
# mailing with no status timestamp win the keep="last" dedupe;
# na_position="first" makes a dated row win whenever one exists.
# NOTE(review): read_df() does not parse dates, so `status_datetime` is
# presumably compared as raw text -- confirm its format sorts chronologically.
mailings_deduped = mailings.sort_values(
    ["job_id", "status_datetime"], na_position="first"
).drop_duplicates("job_id", keep="last")

# Left join on the shared `job_id` key, then discard the mailing's own `id`
# and `created_at` columns.
df = df.merge(mailings_deduped, how="left", on="job_id").drop(
    columns=["id", "created_at"]
)

# Second name for the merged frame (same object, not a copy).
status = df

# NOTE(review): this repeats the job_id check from the jobs merge; if the
# intent was to count treated students without a mailing, a mailings column
# should be tested instead -- confirm.
print(
    "Number of treated students without an associated job_id: "
    + str(df[df.is_treatment == True].job_id.isnull().sum())
)
print("Number of rows: " + str(df.shape[0]))

# Eyeball check that the merge did not duplicate students.
df.student_id.value_counts().head()

Number of treated students without an associated job_id: 19
Number of rows: 1492


22527    1
23090    1
23081    1
23082    1
18987    1
Name: student_id, dtype: int64

In [7]:
# Rows with a missing is_treatment were never assigned to either arm; report
# how many there are, then show the arm sizes for the rest.
print("Number unassigned:", df["is_treatment"].isna().sum())
df["is_treatment"].value_counts()

Number unassigned: 156


True     683
False    653
Name: is_treatment, dtype: int64

# merge data about the experiment with ETO data
Now that we have all the data about the experiment in a single data frame, we can combine it with data from ETO. 

In [8]:
# Keep one ETO record per client (CLID): the row with the latest
# `referral_date`. sort_values() places missing values last by default, which
# would let a record with no referral date win the keep="last" dedupe;
# na_position="first" makes a dated row win whenever one exists.
# NOTE(review): read_df() does not parse dates, so `referral_date` is
# presumably compared as raw text -- confirm its format sorts chronologically.
eto_deduped = eto_data.sort_values(
    "referral_date", na_position="first"
).drop_duplicates(["CLID"], keep="last")

# Drop the guardian name columns before merging, bc they will be present in
# both frames and are duplicative with id.
df = df.drop(columns=["guardian_firstname", "guardian_lastname"])

# Left join keeps every experiment row even without an ETO record.
# `copy=False` is omitted: it does not affect the merged values, and recent
# pandas releases deprecate the keyword.
df = df.merge(eto_deduped, how="left", left_on="student_id", right_on="CLID")

In [9]:
# Restrict sample to students with a current referral status, then derive the
# outcome flag and a display name. Building the new columns with .assign() on
# the filtered frame returns a fresh DataFrame, which avoids the
# SettingWithCopyWarning raised when columns are assigned onto a
# boolean-filtered slice.
# NOTE(review): the guardian name columns were dropped from the experiment
# frame before the ETO merge, so they are presumably supplied by the ETO
# extract here -- confirm.
df = df[df["current_referral_status"].notnull()].assign(
    engaged=lambda d: d["current_referral_status"] == "Engaged",
    fullname=lambda d: d["guardian_firstname"] + " " + d["guardian_lastname"],
)

# Sample size and the overall share of students marked "Engaged".
print(len(df), df.engaged.mean())

1426 0.105189340813


In [10]:
# Write the final frame to disk as UTF-8 CSV, leaving out the row index.
df.to_csv(
    "../data/data_for_analysis.csv",
    index=False,
    encoding="utf-8",
)