In [1]:
import hashlib
from datetime import date, timedelta
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from scipy import stats as st

from thompson import codered, model, unvaccinated

In [2]:
def random_orderer(unique_id: str, password: str = "textmessages") -> str:
    """
    For consistency, use this function to create a random order of unique ids
    so that they appear in the same order every time
    """
    sha = hashlib.sha256()
    sha.update(password.encode("utf8"))
    sha.update(unique_id.encode("utf8"))
    return sha.hexdigest()

In [3]:
# Directory containing unvaccinated people data

DATA_DIR = Path.cwd() / ".." / ".." / ".." / "data" / "unvax_data"

In [4]:
# Settings
first_day_to_send = date(2021, 6, 9)
num_messages = 9  # Note that one is a control
num_days_to_send = 5
# num_in_first_round = 40_000  # This week we're sending _everyone remaining_
uniform_probability = 0.33
seed = 3924857

In [5]:
# Read data
data_file = DATA_DIR / "input" / "unvax_contact_list_20210608_uniq_id.csv"
df = unvaccinated.read_unvaccinated_csv(data_file)

In [6]:
# Make sure unique ids are unique and also get length of dataframe
df["unique_id"].value_counts().value_counts()

1    151796
Name: unique_id, dtype: int64

In [7]:
# Explore primary language
df["primary_language"].value_counts(normalize=True)

English    0.995599
Spanish    0.004038
Portugu    0.000165
Other      0.000152
Chinese    0.000020
Haitian    0.000020
French     0.000007
Name: primary_language, dtype: float64

In [8]:
# We decided to only send messages in English and Spanish
# So change primary language appropriately
df["primary_language"] = df["primary_language"].apply(
    lambda x: "Spanish" if x == "Spanish" else "English"
)

df["primary_language"].value_counts(normalize=True)

English    0.995962
Spanish    0.004038
Name: primary_language, dtype: float64

In [9]:
# Starting in the second week we have Spanish language messages
# so do not filter
# df = df[df['primary_language'] == 'English']

### Determine missing people (i.e., people who were vaccinated)

In [10]:
# Read in previous weeks' data
old_df = pd.concat(
    [
        pd.read_csv(DATA_DIR / "output" / "full-2021-05-25-for-week.csv"),
        pd.read_csv(DATA_DIR / "output" / "full-2021-06-02-for-week.csv"),
    ]
)

# We ended up _not_ sending the Spanish messages. So remove the
# preferred language Spanish folks
old_df = old_df[(old_df["Preferred Language"] != "Spanish")].copy()

In [11]:
# Look at results from the experiment up to this point
merged_df = old_df[["ContactId", "Groups", "date_sent"]].merge(
    df[["unique_id"]],
    left_on="ContactId",
    right_on="unique_id",
    how="left",
    indicator="_merge",
)

merged_df["got_vaccinated"] = merged_df["_merge"] == "left_only"
merged_df["treatment_assignment"] = merged_df["Groups"].str[-1].astype(int)

successes = merged_df.groupby("Groups")["got_vaccinated"].sum()
totals = merged_df.groupby("Groups").size().rename("total")
failures = (totals - successes).rename("not_vaccinated")

historic_data = pd.concat([successes, failures, totals], axis=1)
historic_data["prop_vaccinated"] = (
    historic_data["got_vaccinated"] / historic_data["total"]
)

In [12]:
historic_data

Unnamed: 0_level_0,got_vaccinated,not_vaccinated,total,prop_vaccinated
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
message_0,647,7292,7939,0.081496
message_1,607,6907,7514,0.080783
message_2,790,9302,10092,0.07828
message_3,750,8748,9498,0.078964
message_4,555,6303,6858,0.080927
message_5,957,11989,12946,0.073922
message_6,580,5854,6434,0.090146
message_7,741,9294,10035,0.073842
message_8,680,7713,8393,0.08102


In [13]:
# Any actual differences?
st.chi2_contingency(historic_data[["got_vaccinated", "not_vaccinated"]].values)

(21.096583120804414,
 0.006895242723399697,
 8,
 array([[  628.17590234,  7310.82409766],
        [  594.54764205,  6919.45235795],
        [  798.53271274,  9293.46728726],
        [  751.53227365,  8746.46772635],
        [  542.64143321,  6315.35856679],
        [ 1024.3563713 , 11921.6436287 ],
        [  509.09229823,  5924.90770177],
        [  794.0225696 ,  9240.9774304 ],
        [  664.09879687,  7728.90120313]]))

In [14]:
# Remove anyone who has already received a message or was explicitly
# placed in the control group
tmp_df = df.merge(
    old_df[["ContactId"]],
    left_on="unique_id",
    right_on="ContactId",
    how="left",
    indicator="_merge",
)

df = tmp_df[tmp_df["_merge"] == "left_only"].drop(columns=["_merge"])
print(f"Pool of people for this week: {len(df)}")

Pool of people for this week: 78394


In [15]:
# Get assignments for the coming week
num_in_first_round = len(df)  # We're sending _all_ the remaining folks

assignments, is_uniform = model.get_assignments(
    num_messages,
    merged_df["treatment_assignment"].values,
    merged_df["got_vaccinated"].values,
    num_in_first_round,
    uniform_probability,
    seed=seed,
)

### Do randomization

In [16]:
# We decided to _only_ do one stratum for now
df["stratum"] = 0

In [17]:
# Shuffle for randomization
df["random_order"] = df["unique_id"].apply(random_orderer)
df = df.sort_values("random_order")

# This is for the first round assignments
df = df.iloc[:num_in_first_round].reset_index(drop=True)

# Assign the message per stratum
df["message_num"] = assignments
df["is_chosen_from_uniform"] = is_uniform == 1
df["day_to_send"] = df.groupby(["stratum", "message_num"]).cumcount() % num_days_to_send

In [18]:
day_to_data = {
    day: [
        codered.CoderedContact(
            # Add an x to force Excel _not_ to convert numbers
            contact_id=row["unique_id"],
            first_name="N/A",
            last_name="N/A",
            groups=f"message_{row['message_num']}",
            # This is to be filled in by Zayid
            text_number=-1,
            tags=row["primary_language"],
            preferred_language=row["primary_language"],
        )
        for _, row in mini_df.iterrows()
    ]
    for day, mini_df in df.groupby("day_to_send")
}

In [19]:
# I can never remember the calendar constants....
SUNDAY = date(2021, 5, 30).weekday()
SATURDAY = date(2021, 5, 29).weekday()

In [20]:
files: List[Path] = []
full_dfs: List[pd.DataFrame] = []

weekend_offset = 0
for day, data in day_to_data.items():
    day_to_send = first_day_to_send + timedelta(days=day + weekend_offset)
    while day_to_send.weekday() in [SATURDAY, SUNDAY]:
        weekend_offset += 1
        day_to_send = first_day_to_send + timedelta(days=day + weekend_offset)

    filename = (
        DATA_DIR
        / "output"
        / f"{day_to_send.strftime('%Y-%m-%d')}_text_message_uniq_id.xlsx"
    )

    files.append(filename)
    codered.make_excel_file(filename, data, drop_message_0=True)

    this_df = codered.make_df_from_data(data)
    this_df["date_sent"] = day_to_send

    tmp_len = len(this_df)
    this_df = this_df.merge(
        df[["unique_id", "is_chosen_from_uniform"]],
        left_on="ContactId",
        right_on="unique_id",
    ).drop(columns="unique_id")
    assert len(this_df) == tmp_len

    full_dfs.append(this_df)

### Make sure outputs look reasonable

In [21]:
df = pd.read_excel(files[0])

In [22]:
# Make sure that there are ~1111 = 40k recipients / 4 days / 9 messages
df["Groups"].value_counts().sort_index()

message_1      745
message_2      587
message_3      616
message_4      813
message_5      575
message_6    10156
message_7      582
message_8      761
Name: Groups, dtype: int64

In [23]:
# Make sure all of the ContactIds are unique
dfs = [pd.read_excel(filename) for filename in files]
df = pd.concat(dfs)
df["ContactId"].value_counts().value_counts()

1    74160
Name: ContactId, dtype: int64

In [24]:
# Make sure all the missing ContactIds are message_0
full_df = pd.concat(full_dfs)
joined = full_df[["ContactId", "Groups"]].merge(
    df[["ContactId", "Groups"]], on=["ContactId"], how="left"
)
joined["Groups_y"] = joined["Groups_y"].fillna("message_0")
assert (joined["Groups_x"] == joined["Groups_y"]).all()

In [25]:
# Write out the full_df for record keeping
full_df.to_csv(
    DATA_DIR / "output" / f'full-{first_day_to_send.strftime("%Y-%m-%d")}-for-week.csv',
    index=False,
)

In [26]:
full_df["Preferred Language"].value_counts()

English    77797
Spanish      597
Name: Preferred Language, dtype: int64

In [27]:
pos_df = merged_df.pivot_table(
    index="Groups",
    columns=pd.to_datetime(merged_df["date_sent"]).dt.dayofweek,
    values="got_vaccinated",
    aggfunc="sum",
)

mean_df = merged_df.pivot_table(
    index="Groups",
    columns=pd.to_datetime(merged_df["date_sent"]).dt.dayofweek,
    values="got_vaccinated",
    aggfunc="mean",
)

tot_df = (pos_df / mean_df).astype(int)

tot_df["total"] = tot_df.sum(axis=1)
tot_df = tot_df.append(tot_df.sum(axis=0).rename("total"))

pos_df["total"] = pos_df.sum(axis=1)
pos_df = pos_df.append(pos_df.sum(axis=0).rename("total"))

In [28]:
tot_df

date_sent,0,1,2,3,4,total
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
message_0,698,1813,1810,1808,1810,7939
message_1,612,1726,1727,1722,1727,7514
message_2,1124,2241,2242,2242,2243,10092
message_3,1011,2123,2121,2121,2122,9498
message_4,482,1592,1596,1592,1596,6858
message_5,1704,2810,2809,2811,2812,12946
message_6,398,1508,1509,1510,1508,6433
message_7,1121,2224,2228,2233,2229,10035
message_8,792,1902,1898,1901,1900,8393
total,7942,17939,17940,17940,17947,79708


In [29]:
(pos_df / tot_df).applymap(lambda x: f"{x:0.4f}")

date_sent,0,1,2,3,4,total
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
message_0,0.0501,0.0899,0.0845,0.0785,0.0851,0.0815
message_1,0.0605,0.0753,0.0921,0.0842,0.0787,0.0808
message_2,0.0641,0.0812,0.0812,0.0807,0.0771,0.0783
message_3,0.0544,0.081,0.0839,0.0778,0.0848,0.079
message_4,0.0394,0.0798,0.0733,0.0911,0.0921,0.0809
message_5,0.0493,0.0765,0.0751,0.0765,0.0825,0.0739
message_6,0.0603,0.0875,0.0968,0.1053,0.0789,0.0902
message_7,0.058,0.071,0.0745,0.082,0.0758,0.0738
message_8,0.0619,0.0925,0.0843,0.0752,0.08,0.081
total,0.0554,0.0811,0.0821,0.0824,0.0815,0.0791
