In [1]:
import calendar
import hashlib
from datetime import date, timedelta
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from scipy import stats as st

from thompson import codered, model, unvaccinated

In [2]:
def random_orderer(unique_id: str, password: str = "textmessages") -> str:
    """
    Return a deterministic sort key for ``unique_id``.

    The key is the hex SHA-256 digest of ``password`` followed by
    ``unique_id`` (both UTF-8 encoded). Sorting ids by this key yields a
    shuffled-looking order that is identical every time it is computed.
    """
    salted_id = password.encode("utf8") + unique_id.encode("utf8")
    return hashlib.sha256(salted_id).hexdigest()

In [3]:
# Root directory of the unvaccinated-people data, located relative to the
# current working directory (three levels up, then data/unvax_data)
DATA_DIR = Path.cwd().joinpath("..", "..", "..", "data", "unvax_data")

In [4]:
# Settings for this week's send
first_day_to_send = date(2021, 6, 2)  # base date; send dates are offsets from it, and it names the output CSV
num_messages = 9  # Note that one is a control (message_0 does not appear in the Excel exports)
num_days_to_send = 5  # people within each message group are cycled over this many send days
num_in_first_round = 40_000  # how many people are taken from the front of the shuffled pool
uniform_probability = 0.25  # passed to model.get_assignments; presumably the share drawn uniformly - TODO confirm
seed = 3924857  # passed to model.get_assignments as seed=

In [5]:
# Load this week's contact list of unvaccinated people
data_file = DATA_DIR.joinpath("input", "unvax_contact_list_20210601_uniq_id.csv")
df = unvaccinated.read_unvaccinated_csv(data_file)

In [6]:
# Make sure unique ids are unique and also get length of dataframe.
# The assertion stops a Restart & Run All on duplicates instead of relying
# on someone reading the table below; later merges on unique_id assume one
# row per id.
assert df["unique_id"].is_unique, "duplicate unique_id values in the contact list"
df["unique_id"].value_counts().value_counts()

1    148567
Name: unique_id, dtype: int64

In [7]:
# Explore primary language: share of rows for each recorded value
df["primary_language"].value_counts(normalize=True)

English    0.995793
Spanish    0.003850
Portugu    0.000175
Other      0.000141
Haitian    0.000020
Chinese    0.000013
French     0.000007
Name: primary_language, dtype: float64

In [8]:
# Messages are only sent in English and Spanish, so every primary language
# other than Spanish is collapsed into English
is_spanish = df["primary_language"] == "Spanish"
df["primary_language"] = np.where(is_spanish, "Spanish", "English")

df["primary_language"].value_counts(normalize=True)

English    0.99615
Spanish    0.00385
Name: primary_language, dtype: float64

In [9]:
# Starting in the second week we also send Spanish-language messages,
# so the English-only filter below is intentionally left disabled.
# df = df[df['primary_language'] == 'English']

### Determine missing people (i.e., people who were vaccinated)

In [10]:
# Read in the previous week's assignment record. Only this one file is
# loaded; it follows the same "full-<date>-for-week.csv" naming that this
# notebook uses when it writes out its own record at the end.
old_df = pd.read_csv(DATA_DIR / "output" / "full-2021-05-25-for-week.csv")

In [11]:
# Look at results from the experiment up to this point.
# Anyone from the previous round who no longer appears in the current
# unvaccinated contact list is counted as having been vaccinated.
merged_df = old_df[["ContactId", "Groups"]].merge(
    df[["unique_id"]],
    left_on="ContactId",
    right_on="unique_id",
    how="left",
    indicator="_merge",
)

merged_df["got_vaccinated"] = merged_df["_merge"] == "left_only"
# Parse the whole numeric suffix of "message_<n>" rather than only its last
# character, so a two-digit message number is not silently truncated
# (e.g. "message_10" must become 10, not 0).
merged_df["treatment_assignment"] = (
    merged_df["Groups"].str.rsplit("_", n=1).str[-1].astype(int)
)

successes = merged_df.groupby("Groups")["got_vaccinated"].sum()
totals = merged_df.groupby("Groups").size().rename("total")
failures = (totals - successes).rename("not_vaccinated")

# One row per message group: got_vaccinated / not_vaccinated / total
historic_data = pd.concat([successes, failures, totals], axis=1)

In [12]:
# Per-message-group counts of vaccinated / not vaccinated / total
historic_data

Unnamed: 0_level_0,got_vaccinated,not_vaccinated,total
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
message_0,478,3967,4445
message_1,475,3970,4445
message_2,486,3959,4445
message_3,484,3961,4445
message_4,471,3973,4444
message_5,493,3951,4444
message_6,466,3978,4444
message_7,486,3958,4444
message_8,480,3964,4444


In [13]:
# Any actual differences? Chi-square test on the group x outcome table
# (vaccinated vs. not vaccinated for each message group)
outcome_counts = historic_data[["got_vaccinated", "not_vaccinated"]].to_numpy()
st.chi2_contingency(outcome_counts)

(1.3133485304925052,
 0.995390763045943,
 8,
 array([[ 479.948875, 3965.051125],
        [ 479.948875, 3965.051125],
        [ 479.948875, 3965.051125],
        [ 479.948875, 3965.051125],
        [ 479.8409  , 3964.1591  ],
        [ 479.8409  , 3964.1591  ],
        [ 479.8409  , 3964.1591  ],
        [ 479.8409  , 3964.1591  ],
        [ 479.8409  , 3964.1591  ]]))

In [14]:
# Get assignments for the coming week from the previous round's outcomes.
# NOTE: merged_df must still be the previous-round results frame here (it
# carries the treatment_assignment and got_vaccinated columns). A later cell
# rebinds merged_df, so this cell cannot be re-run after that one.
# Presumably returns one message number and one "drawn uniformly" flag per
# first-round person - TODO confirm against thompson.model.
assignments, is_uniform = model.get_assignments(
    num_messages,
    merged_df["treatment_assignment"].values,
    merged_df["got_vaccinated"].values,
    num_in_first_round,
    uniform_probability,
    seed=seed,
)

In [15]:
# Drop everyone who appears in the previous week's record, i.e. anyone who
# has already received a message or was explicitly placed in the control group.
# NOTE: this rebinds merged_df, replacing the previous-round results frame.
merged_df = pd.merge(
    df,
    old_df[["ContactId"]],
    how="left",
    left_on="unique_id",
    right_on="ContactId",
    indicator="_merge",
)

not_yet_contacted = merged_df["_merge"] == "left_only"
df = merged_df.loc[not_yet_contacted].drop(columns=["_merge"])
print(f"Pool of people for this week: {len(df)}")

Pool of people for this week: 112886


### Do randomization

In [16]:
# Only a single stratum is used for now, so everyone gets stratum 0
df = df.assign(stratum=0)

In [17]:
# Deterministic shuffle: order people by the salted hash of their unique id
df["random_order"] = df["unique_id"].map(random_orderer)
df = df.sort_values(by="random_order")

# Keep the first num_in_first_round people for the first-round assignments
df = df.head(num_in_first_round).reset_index(drop=True)

# Attach each person's message, then cycle the members of every
# (stratum, message) group across the send days
df["message_num"] = assignments
df["is_chosen_from_uniform"] = is_uniform == 1
position_in_group = df.groupby(["stratum", "message_num"]).cumcount()
df["day_to_send"] = position_in_group % num_days_to_send

In [18]:
def _row_to_contact(row):
    """Build the CodeRED contact record for one row of the assignment table."""
    return codered.CoderedContact(
        # The unique id is passed through unchanged
        contact_id=row["unique_id"],
        first_name="N/A",
        last_name="N/A",
        groups=f"message_{row['message_num']}",
        # This is to be filled in by Zayid
        text_number=-1,
        tags=row["primary_language"],
        preferred_language=row["primary_language"],
    )


# Map each send-day index to the list of contacts going out on that day
day_to_data = {
    day: [_row_to_contact(row) for _, row in mini_df.iterrows()]
    for day, mini_df in df.groupby("day_to_send")
}

In [19]:
# Weekday numbers as returned by date.weekday() (Monday is 0), taken from the
# standard library's calendar module instead of being derived from known dates
SUNDAY = calendar.SUNDAY
SATURDAY = calendar.SATURDAY

In [20]:
files: List[Path] = []
full_dfs: List[pd.DataFrame] = []

# Running count of weekend days skipped so far. It carries over between
# iterations so that every later send day stays pushed past the weekend.
weekend_offset = 0
for day, data in day_to_data.items():
    # Move forward one day at a time until the send date is not a weekend day
    while True:
        day_to_send = first_day_to_send + timedelta(days=day + weekend_offset)
        if day_to_send.weekday() not in (SATURDAY, SUNDAY):
            break
        weekend_offset += 1

    date_stamp = day_to_send.strftime("%Y-%m-%d")
    filename = DATA_DIR / "output" / f"{date_stamp}_text_message_uniq_id.xlsx"

    files.append(filename)
    codered.make_excel_file(filename, data, drop_message_0=True)

    # Build the record-keeping frame for this day and tag it with its send date
    day_df = codered.make_df_from_data(data)
    day_df["date_sent"] = day_to_send
    expected_rows = len(day_df)
    day_df = pd.merge(
        day_df,
        df[["unique_id", "is_chosen_from_uniform"]],
        left_on="ContactId",
        right_on="unique_id",
    ).drop(columns="unique_id")
    # The merge must neither drop nor duplicate any contact
    assert len(day_df) == expected_rows
    full_dfs.append(day_df)

### Make sure outputs look reasonable

In [21]:
# Read back the first Excel file that was written.
# NOTE: this rebinds df, which until now held this week's assignment table,
# so the export loop above cannot be re-run after this cell.
df = pd.read_excel(files[0])

In [22]:
# Sanity-check the per-message counts in the first day's file.
# A perfectly even split would be ~889 = 40k recipients / 5 days / 9 messages
# (num_days_to_send is 5), but the observed counts are visibly uneven; the
# assignments come from model.get_assignments rather than an even split.
# message_0 does not appear because the Excel files are written with
# drop_message_0=True.
df["Groups"].value_counts().sort_index()

message_1     619
message_2    1138
message_3    1018
message_4     486
message_5    1714
message_6     401
message_7    1127
message_8     796
Name: Groups, dtype: int64

In [23]:
# Make sure all of the ContactIds are unique across every Excel file written.
# The assertion stops a Restart & Run All if a contact would be messaged
# twice, instead of relying on someone reading the table below.
dfs = [pd.read_excel(filename) for filename in files]
df = pd.concat(dfs)
assert df["ContactId"].is_unique, "a ContactId appears more than once in the Excel files"
df["ContactId"].value_counts().value_counts()

1    36483
Name: ContactId, dtype: int64

In [24]:
# Every ContactId in the full record that is absent from the Excel readback
# must belong to message_0; all others must carry the same group in both
full_df = pd.concat(full_dfs)
joined = pd.merge(
    full_df[["ContactId", "Groups"]],
    df[["ContactId", "Groups"]],
    how="left",
    on=["ContactId"],
)
joined["Groups_y"] = joined["Groups_y"].fillna("message_0")
assert joined["Groups_x"].eq(joined["Groups_y"]).all()

In [25]:
# Persist the complete assignment record for this week (record keeping)
week_stamp = first_day_to_send.strftime("%Y-%m-%d")
full_df.to_csv(DATA_DIR / "output" / f"full-{week_stamp}-for-week.csv", index=False)

In [26]:
# Final check: number of recipients per preferred language in the full record
full_df["Preferred Language"].value_counts()

English    39709
Spanish      291
Name: Preferred Language, dtype: int64