In [1]:
import hashlib
from datetime import date, timedelta
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from thompson import codered, unvaccinated

In [2]:
def random_orderer(unique_id: str, password: str = "textmessages") -> str:
    """
    Return a stable, pseudo-random sort key for ``unique_id``.

    The key is the hex SHA-256 digest of ``password`` followed by
    ``unique_id`` (both UTF-8 encoded). The same id and password always give
    the same key, so sorting by it reproduces the same order every time.
    """
    salted_id = password.encode("utf8") + unique_id.encode("utf8")
    return hashlib.sha256(salted_id).hexdigest()

In [3]:
# Location of the unvaccinated-people data, expressed relative to the
# current working directory (three levels up, then data/unvax_data)
DATA_DIR = Path.cwd().joinpath("..", "..", "..", "data", "unvax_data")

In [4]:
# Experiment settings
first_day_to_send = date(year=2021, month=5, day=25)  # date of the first batch
num_messages = 9  # number of message groups; note that one is a control
num_days_to_send = 4  # recipients are cycled over this many send days
num_in_first_round = 40000  # rows kept for the first round

In [5]:
# Load the contact list from the input folder under DATA_DIR
data_file = DATA_DIR.joinpath("input", "unvax_contact_list_20210521_uniq_id.csv")
df = unvaccinated.read_unvaccinated_csv(data_file)

In [6]:
# Sanity check: count how often each unique id occurs, then tally those
# counts. A single row labelled 1 means every id is unique, and its value is
# the number of rows in the dataframe.
occurrences_per_id = df["unique_id"].value_counts()
occurrences_per_id.value_counts()

1    162504
Name: unique_id, dtype: int64

In [7]:
# Share of records per primary language
language_share = df["primary_language"].value_counts(normalize=True)
language_share

English    0.995299
Spanish    0.004277
Portugu    0.000234
Other      0.000148
Haitian    0.000018
Chinese    0.000018
French     0.000006
Name: primary_language, dtype: float64

In [8]:
# Messages are only sent in English and Spanish, so every primary language
# other than Spanish is relabelled as English
def _english_unless_spanish(language):
    """Return "Spanish" for a Spanish entry and "English" for anything else."""
    if language == "Spanish":
        return "Spanish"
    return "English"


df["primary_language"] = df["primary_language"].apply(_english_unless_spanish)

df["primary_language"].value_counts(normalize=True)

English    0.995723
Spanish    0.004277
Name: primary_language, dtype: float64

In [9]:
# For now we do _not_ have Spanish language media, so we will
# drop Spanish language preference and wait for them next week.
# Take an explicit copy: later cells add columns to `df`, and assigning into
# a boolean-filtered selection triggers pandas' SettingWithCopyWarning.
df = df[df["primary_language"] == "English"].copy()

### Do randomization

In [10]:
# We decided to _only_ do one stratum for now, so every row gets stratum 0.
# The column is still needed because the message assignment groups by it.
df["stratum"] = 0

In [11]:
# Deterministic shuffle: every id gets a hash-based key and the rows are
# ordered by that key, so re-running yields the same order
df["random_order"] = df["unique_id"].apply(random_orderer)
shuffled = df.sort_values("random_order")

# The first-round recipients are the leading rows of the shuffled frame
df = shuffled.iloc[:num_in_first_round].reset_index(drop=True)

# Within each stratum, cycle through the message numbers row by row ...
position_in_stratum = df.groupby("stratum").cumcount()
df["message_num"] = position_in_stratum.mod(num_messages)

# ... and within each (stratum, message) pair, cycle through the send days
position_in_message = df.groupby(["stratum", "message_num"]).cumcount()
df["day_to_send"] = position_in_message.mod(num_days_to_send)

In [12]:
def _contacts_for_day(day_df):
    """
    Build one ``codered.CoderedContact`` per row of ``day_df``.

    Names are the placeholder "N/A", the group is ``message_<message_num>``,
    and the row's primary language is used both as tag and as preferred
    language.
    """
    contacts = []
    for _, row in day_df.iterrows():
        contact = codered.CoderedContact(
            # The unique id is passed through unchanged.
            # NOTE(review): an earlier comment here spoke of adding an "x" to
            # force Excel not to convert numbers, but no prefix is applied —
            # confirm that this is intended.
            contact_id=row["unique_id"],
            first_name="N/A",
            last_name="N/A",
            groups=f"message_{row['message_num']}",
            # Placeholder; this is to be filled in by Zayid
            text_number=-1,
            tags=row["primary_language"],
            preferred_language=row["primary_language"],
        )
        contacts.append(contact)
    return contacts


# One list of contacts per value of day_to_send
day_to_data = {
    day: _contacts_for_day(mini_df) for day, mini_df in df.groupby("day_to_send")
}

In [13]:
files: List[Path] = []
full_dfs: List[pd.DataFrame] = []
output_dir = DATA_DIR / "output"

# For every day offset: write that day's Excel file and keep a dataframe of
# the same contacts, stamped with the send date, for the record
for day, data in day_to_data.items():
    day_to_send = first_day_to_send + timedelta(days=day)
    dated_name = f"{day_to_send.strftime('%Y-%m-%d')}_text_message_uniq_id.xlsx"
    filename = output_dir / dated_name

    files.append(filename)
    # NOTE(review): drop_message_0 presumably leaves the message_0 group out
    # of the Excel file — confirm against codered.make_excel_file
    codered.make_excel_file(filename, data, drop_message_0=True)

    this_df = codered.make_df_from_data(data).assign(date_sent=day_to_send)
    full_dfs.append(this_df)

### Make sure outputs look reasonable

In [14]:
# Read back the first Excel file that was written, to inspect it.
# Note: this rebinds `df`, which until now held the assignment table.
first_day_file = files[0]
df = pd.read_excel(first_day_file)

In [15]:
# Each group should hold ~1111 recipients (40k recipients / 4 days / 9 messages)
group_sizes = df["Groups"].value_counts()
group_sizes.sort_index()

message_1    1112
message_2    1112
message_3    1112
message_4    1111
message_5    1111
message_6    1111
message_7    1111
message_8    1111
Name: Groups, dtype: int64

In [16]:
# Every ContactId should appear exactly once across all of the Excel files
dfs = list(map(pd.read_excel, files))
df = pd.concat(dfs)
df["ContactId"].value_counts().value_counts()

1    35555
Name: ContactId, dtype: int64

In [17]:
# Contacts that are in full_df but not in the Excel files must all belong to
# message_0: left-join the Excel rows onto full_df, treat a missing match as
# message_0, and require the two group labels to agree on every row
full_df = pd.concat(full_dfs)
group_columns = ["ContactId", "Groups"]
joined = pd.merge(
    full_df[group_columns], df[group_columns], on=["ContactId"], how="left"
)
joined["Groups_y"] = joined["Groups_y"].fillna("message_0")
# Extra flag column on the record frame; it plays no part in the check
full_df["is_chosen_from_uniform"] = True
groups_agree = joined["Groups_x"] == joined["Groups_y"]
assert groups_agree.all()

In [18]:
# Write out the full_df for record keeping. The file name is derived from
# first_day_to_send (same date format as the Excel files), so a run for
# another week cannot silently overwrite this week's record.
full_df.to_csv(
    DATA_DIR
    / "output"
    / f"full-{first_day_to_send.strftime('%Y-%m-%d')}-for-week.csv",
    index=False,
)

In [19]:
# Final look at the preferred languages present in the record
preferred_language_counts = full_df["Preferred Language"].value_counts()
preferred_language_counts

English    40000
Name: Preferred Language, dtype: int64