# Evaluating the SUSO outreach letters

This notebook gathers data about the automated letters sent to a randomly assigned group of SUSO-eligible families in order to increase the rate at which those families engage with SUSO CBOs. It does not merge or otherwise analyze the data.  

In [1]:
import sys

sys.path.append("..")

import copy
import os
from datetime import datetime

import numpy as np
import pandas as pd
import pyodbc
import yaml

from suso import eto

# Obtain secrets
Secrets for the various data sources must be stored in a `config.yml` file in the notebook's working directory before conducting any analyses. 

In [2]:
# Directory (relative to the working directory) where the CSV files are written.
DATA_DIR = os.path.join("..", "data")

# YAML file holding the credentials; resolved relative to the working directory.
CONFIG_FILE = "config.yml"

# safe_load only builds plain Python objects (dicts, lists, strings, numbers)
# and, unlike a bare yaml.load(f), does not require an explicit Loader argument.
with open(CONFIG_FILE) as f:
    config = yaml.safe_load(f)

# Connect to Lab SUSO Database

All of the data we need is contained in the MS SQL Server database created specifically for this purpose. 

In [3]:
def get_connection(config):
    """Open a pyodbc connection from a settings mapping.

    The mapping's ``username`` and ``password`` entries are handed to
    ``pyodbc.connect`` as ``uid`` and ``pwd``; every other entry is forwarded
    unchanged as a keyword argument. A shallow copy is edited, so the caller's
    mapping is left untouched.

    Raises:
        KeyError: if ``username`` or ``password`` is missing from ``config``.
    """
    connect_kwargs = copy.copy(config)
    connect_kwargs["uid"] = connect_kwargs.pop("username")
    connect_kwargs["pwd"] = connect_kwargs.pop("password")
    return pyodbc.connect(**connect_kwargs)

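# Get tables

Each table is read in full from the database into a DataFrame and saved as a CSV file in the data directory.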

In [5]:
def table_to_df(table_name):
    """Load the ``<table_name>_new`` database table into a DataFrame.

    The full table is fetched, written to CSV via ``df_to_csv`` and also
    bound to a module-level variable named ``table_name``.

    Args:
        table_name: Base name of the table (without the ``_new`` suffix).
            It is interpolated directly into the SQL text, so it must come
            from trusted code, never from user input.

    Returns:
        The DataFrame holding every row and column of the table.
    """
    conn = get_connection(config["db"])
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(f"""SELECT * FROM {table_name}_new""")
            # The first item of each description entry is the column name.
            column_list = [column[0] for column in cursor.description]
            df = pd.DataFrame.from_records(cursor.fetchall(), columns=column_list)
        finally:
            cursor.close()
    finally:
        # Release the connection even if the query or fetch fails.
        conn.close()
    df_to_csv(df, table_name)
    globals()[table_name] = df
    return df


def df_to_csv(df, table_name):
    """Write ``df`` to ``<DATA_DIR>/<table_name>.csv`` encoded as UTF-8."""
    destination = os.path.join(DATA_DIR, table_name + """.csv""")
    df.to_csv(destination, encoding="utf-8")


# Base names of the tables to download; table_to_df appends the "_new" suffix
# when querying and saves each result as <name>.csv.
tables = [
    "students",
    "randomizer",
    "jobs",
    "mailings",
    "status",
]
for table_name in tables:
    table_to_df(table_name)

# Connect to ETO db


In [6]:
# Create the ETO API handler, then log in with the credentials stored under
# the "eto" key of the config.
api = eto.ApiHandler()
api.login(
    config["eto"]["username"],
    config["eto"]["password"],
)

Fetching participants from ETO is very slow, so we first check for previously downloaded data. 

In [7]:
# Date window for the ETO query: a fixed start date through today.
begin_date = "2018-01-04"
end_date = datetime.now().strftime("%Y-%m-%d")

# Local CSV used as a cache of the ETO participant data.
data_file = os.path.join(DATA_DIR, "eto_data.csv")

# Set to True to query ETO; leave False to work from the cached CSV only.
get_data = False

if not get_data:
    eto_data = pd.read_csv(data_file)
elif not os.path.isfile(data_file):
    # No cache yet: fetch the whole window and save it.
    eto_data = api.get_all_participants(begin_date, end_date)
    eto_data.to_csv(data_file, index=False)
else:
    # Cache present: only fetch from the latest saved start date onwards.
    # NOTE(review): if the API treats begin_date as inclusive, records from
    # that day may be fetched again; the `len(new_data) < 2` check presumably
    # accounts for a single overlapping record -- confirm.
    existing_data = pd.read_csv(data_file)
    begin_date = (
        pd.to_datetime(existing_data["start_date"]).max().strftime("%Y-%m-%d")
    )
    new_data = api.get_all_participants(begin_date, end_date)
    eto_data = (
        existing_data if len(new_data) < 2 else pd.concat([existing_data, new_data])
    )
    eto_data.to_csv(data_file, index=False)

In [8]:
# Whole days between end_date and each participant's start date.
# NOTE(review): this reads a `start_date` column that must already exist in
# eto_data (e.g. from a previously saved CSV); the column is (re)derived from
# ProgramStartDate only in a later cell -- confirm the intended cell order.
eto_data["start_delta"] = (
    pd.to_datetime(end_date) - pd.to_datetime(eto_data["start_date"])
).dt.days

# Flag participants with more than 14 days in the program. The comparison is
# done column-wise rather than with a Python-level loop over the values.
eto_data["two_weeks_up"] = eto_data["DaysInProgram"] > 14

In [9]:
# Summary statistics for the derived columns. The output below lists only
# start_delta and DaysInProgram; the boolean two_weeks_up column is not shown.
eto_data.loc[:, ["start_delta", "DaysInProgram", "two_weeks_up"]].describe()

Unnamed: 0,start_delta,DaysInProgram
count,2315.0,2315.0
mean,152.017279,45.256156
std,39.368696,43.985644
min,63.0,0.0
25%,114.0,12.0
50%,156.0,26.0
75%,189.0,77.0
max,209.0,150.0


In [10]:
def parse_date(datestring):
    """Convert a .NET JSON date string to a pandas Timestamp.

    The expected form is ``/Date(<milliseconds>[<sign>HHMM])/``: the text in
    parentheses is a millisecond count since the Unix epoch, optionally
    followed by a signed ``HHMM`` offset. The offset is added to the epoch
    time, so the result is a timezone-naive timestamp shifted by that offset.

    Args:
        datestring: e.g. ``"/Date(1515024000000-0500)/"``.

    Returns:
        A timezone-naive ``pd.Timestamp``.

    Raises:
        IndexError: if the string has no ``(...)`` part.
        ValueError: if the millisecond count or offset is not numeric.
    """
    timepart = datestring.split("(")[1].split(")")[0]

    # Search from index 1 so that a leading minus on the millisecond count is
    # not mistaken for the sign of the offset.
    sign_at = max(timepart.find("+", 1), timepart.find("-", 1))
    if sign_at == -1:
        # No offset present: the whole value is the millisecond count.
        millis_text, offset_text = timepart, ""
    else:
        millis_text, offset_text = timepart[:sign_at], timepart[sign_at:]

    offset_minutes = 0
    if offset_text:
        # HHMM is hours and minutes, not a decimal number: "+0530" is 5 h 30 min.
        offset_minutes = int(offset_text[1:3]) * 60 + int(offset_text[3:5])
        if offset_text[0] == "-":
            offset_minutes = -offset_minutes

    return pd.to_datetime(int(millis_text), unit="ms") + pd.Timedelta(
        minutes=offset_minutes
    )


# Derive start_date by parsing each ProgramStartDate value with parse_date.
eto_data["start_date"] = eto_data["ProgramStartDate"].apply(parse_date)

In [11]:
# Remove columns that are not kept in the saved data. errors="ignore" keeps
# this cell re-runnable: when eto_data was loaded from the CSV written at the
# end of this notebook, these columns have already been removed.
eto_data = eto_data.drop(
    columns=["CustomFields", "youth_club1", "youth_club2"], errors="ignore"
)

In [12]:
# Overwrite the cached CSV with the processed frame (UTF-8, no index column).
eto_data.to_csv(
    data_file,
    index=False,
    encoding="utf-8",
)