In [1]:
import re
from pathlib import Path

import pandas as pd
from scipy import stats as st

from thompson import unvaccinated

In [2]:
# Directory containing unvaccinated people data

DATA_DIR = Path.cwd() / ".." / ".." / ".." / "data" / "unvax_data"

ORIGINAL_FILES = [
    DATA_DIR / "input" / "unvax_contact_list_20210521_uniq_id.csv",
    DATA_DIR / "input" / "unvax_contact_list_20210601_uniq_id.csv",
    DATA_DIR / "input" / "unvax_contact_list_20210608_uniq_id.csv",
    DATA_DIR / "input" / "unvax_contact_list_20210622_uniq_id_updated.csv",
]

ASSIGNMENT_FILES = [
    DATA_DIR / "output" / "full-2021-05-25-for-week.csv",
    DATA_DIR / "output" / "full-2021-06-02-for-week.csv",
    DATA_DIR / "output" / "full-2021-06-09-for-week.csv",
]

VAX_DATA_FILE = (
    Path.cwd()
    / ".."
    / ".."
    / ".."
    / "data"
    / "unvax_data"
    / "input"
    / "final_vax_list.csv"
)

In [3]:
# Read original (demographic) data in
dfs = []
for data_file in ORIGINAL_FILES:
    df = unvaccinated.read_unvaccinated_csv(data_file)
    the_date = pd.to_datetime(
        re.search(r"(\d{8})", data_file.stem).groups()[0],
        format="%Y%m%d",
    )

    df["record_date"] = the_date
    dfs.append(df)

raw_df = pd.concat(dfs)

In [4]:
# How frequently do records change between pulls? Seems not too frequently
raw_df.drop("record_date", axis=1).groupby("unique_id").nunique().melt().groupby(
    ["variable", "value"]
).size()

variable          value
age               1        175344
                  2           991
                  3             3
city              1        175748
                  2           587
                  3             3
current_age       1        158424
                  2         17869
                  3            45
primary_language  0             1
                  1        176134
                  2           198
                  3             5
race_eth          0          5569
                  1        169216
                  2          1551
                  3             2
sex               0            33
                  1        174862
                  2          1441
                  3             2
test_result       1        175309
                  2          1029
zcta              1        175011
                  2          1324
                  3             3
dtype: int64

In [5]:
# How many unique_ids have at least one different record across time?
(raw_df.drop("record_date", axis=1).groupby("unique_id").nunique() == 1).all(
    axis=1
).mean()

0.8560888747745806

In [6]:
# Keep the first record we see for now
demo_df = raw_df.sort_values(by="record_date").drop_duplicates("unique_id")

In [7]:
# Read in message assignments
old_df = pd.concat([pd.read_csv(filename) for filename in ASSIGNMENT_FILES]).rename(
    columns={
        "ContactId": "unique_id",
        "Groups": "assigned_message",
        "Tags": "message_language",
    }
)[
    [
        "unique_id",
        "assigned_message",
        "message_language",
        "date_sent",
        "is_chosen_from_uniform",
    ]
]

old_df["date_sent"] = pd.to_datetime(old_df["date_sent"])

In [8]:
# Merge data
merged_df = old_df.merge(demo_df, on="unique_id")

In [9]:
# Some Spanish speaking folks appear twice for admin reasons; keep last record
merged_df = merged_df.sort_values(by="date_sent").drop_duplicates(
    "unique_id", keep="last"
)

In [10]:
# Drop people who didn't get message on last day
merged_df = merged_df[merged_df["date_sent"] < "2021-06-15"]

In [11]:
# Print descriptives
for col in ["age", "sex", "city", "race_eth"]:
    print(f"======== {col} =========")
    if col == "age":
        details = merged_df[col].describe()
        details["missing"] = merged_df[col].isna().sum()
        print(details)
    else:
        print(merged_df[col].value_counts(dropna=False))
    print()

count      142456.000000
mean           39.138801
std            16.944347
min            18.000000
25%            25.000000
50%            35.000000
75%            51.000000
max           120.000000
missing         0.000000
Name: age, dtype: float64

female     65310
male       62446
unknown    14674
NaN           26
Name: sex, dtype: int64

providence          30555
pawtucket           10448
cranston             8713
warwick              8221
woonsocket           6111
east providence      5469
newport              5128
bristol              4542
north providence     4215
cumberland           4106
coventry             3949
westerly             3642
west warwick         3614
johnston             3608
smithfield           3156
north kingstown      3069
south kingstown      2832
lincoln              2454
central falls        2416
tiverton             2415
middletown           2379
east greenwich       2302
portsmouth           2225
burrillville         2060
barrington           1635
narra

### Compare vaccination data to data we received for Thompson Sampler

In [12]:
# Read in main data
vax_df = pd.read_csv(VAX_DATA_FILE)

In [13]:
old_df["iteration"] = (
    3 - (old_df["date_sent"] <= "2021-05-28") - (old_df["date_sent"] <= "2021-06-08")
)

In [14]:
raw_df["record_date"].value_counts()

2021-05-21    162504
2021-06-22    151820
2021-06-08    151796
2021-06-01    148567
Name: record_date, dtype: int64

In [15]:
first_iteration_assigned = old_df[old_df["iteration"] == 1]
first_check = raw_df[raw_df["record_date"] == "2021-06-01"]
second_check = raw_df[raw_df["record_date"] == "2021-06-08"]
third_check = raw_df[raw_df["record_date"] == "2021-06-22"]

In [16]:
first_merge = (
    first_iteration_assigned.merge(
        first_check[["unique_id"]],
        how="left",
        on="unique_id",
        indicator="_merge_second_file",
    )
    .merge(
        second_check[["unique_id"]],
        how="left",
        on="unique_id",
        indicator="_merge_third_file",
    )
    .merge(
        third_check[["unique_id"]],
        how="left",
        on="unique_id",
        indicator="_merge_fourth_file",
    )
    .merge(
        vax_df.sort_values(by="admin_date")
        .drop_duplicates("unique_id")
        .loc[vax_df["admin_date"] < "2021-06-02", ["unique_id"]],
        how="left",
        on="unique_id",
        indicator="_merge_vax_file",
    )
)

In [17]:
for key in ["second", "third", "fourth"]:
    first_merge[f"vaccinated_by_{key}_file"] = (
        first_merge[f"_merge_{key}_file"] == "left_only"
    )

first_merge["vaccinated_by_vax_file"] = first_merge["_merge_vax_file"] == "both"

In [18]:
first_merge.groupby(
    ["vaccinated_by_second_file", "vaccinated_by_vax_file"]
).size().reset_index().pivot_table(
    index="vaccinated_by_second_file", columns="vaccinated_by_vax_file", values=0
)

vaccinated_by_vax_file,False,True
vaccinated_by_second_file,Unnamed: 1_level_1,Unnamed: 2_level_1
False,35447,234
True,3280,1039


In [19]:
first_merge.groupby(
    ["assigned_message", "vaccinated_by_second_file", "vaccinated_by_vax_file"]
).size().reset_index().pivot_table(
    index=["assigned_message", "vaccinated_by_second_file"],
    columns=["vaccinated_by_vax_file"],
    values=0,
)

Unnamed: 0_level_0,vaccinated_by_vax_file,False,True
assigned_message,vaccinated_by_second_file,Unnamed: 2_level_1,Unnamed: 3_level_1
message_0,False,3934,33
message_0,True,362,116
message_1,False,3950,20
message_1,True,362,113
message_2,False,3932,27
message_2,True,367,119
message_3,False,3933,28
message_3,True,360,124
message_4,False,3951,22
message_4,True,371,100


In [20]:
# What was the r for "vaccinated" according to the original file
# we received versus actually "vaccinated"
st.linregress(
    first_merge["vaccinated_by_second_file"], first_merge["vaccinated_by_vax_file"]
)

LinregressResult(slope=0.23400683342872236, intercept=0.006558112160533702, rvalue=0.41373048372576793, pvalue=0.0, stderr=0.0025746834932866896, intercept_stderr=0.0008460293995529085)

In [25]:
# What is the correlation in the "noise" by message, as measured by
# people who reappeared in future files
just_vaccinated_after_second_file = first_merge[
    first_merge["vaccinated_by_second_file"]
].copy()

just_vaccinated_after_second_file["did_reappear"] = (
    ~first_merge["vaccinated_by_third_file"] # | ~first_merge["vaccinated_by_fourth_file"]
)
just_vaccinated_after_second_file.groupby("assigned_message")["did_reappear"].mean()

assigned_message
message_0    0.393305
message_1    0.370526
message_2    0.409465
message_3    0.400826
message_4    0.405520
message_5    0.361055
message_6    0.381974
message_7    0.382716
message_8    0.385417
Name: did_reappear, dtype: float64

In [26]:
# Compute the chi-square test for this data
# Note that chi-square is appropriate because while stratification
# was by day, it was equal by day so this shouldn't matter
st.chi2_contingency(
    just_vaccinated_after_second_file.groupby("assigned_message")["did_reappear"]
    .value_counts()
    .rename("values")
    .reset_index()
    .pivot_table(index="assigned_message", columns="did_reappear", values="values")
    .values
)

(4.204046486944356,
 0.8382602669452339,
 8,
 array([[292.62144015, 185.37855985],
        [290.78490391, 184.21509609],
        [297.51887011, 188.48112989],
        [296.29451262, 187.70548738],
        [288.33618893, 182.66381107],
        [301.80412132, 191.19587868],
        [285.27529521, 180.72470479],
        [297.51887011, 188.48112989],
        [293.84579764, 186.15420236]]))

In [27]:
just_vaccinated_after_second_file['did_reappear'].mean()

0.3878212549201204