In [1]:
import re
from pathlib import Path

import pandas as pd
from scipy import stats as st

In [2]:
# Data locations, all expressed relative to the current working directory.
INPUT_DATA_DIR = Path.cwd().joinpath("..", "..", "..", "data", "clickreports", "input")
OUTPUT_DATA_DIR = Path.cwd().joinpath("..", "..", "..", "data", "clickreports", "output")

# Only read from in this notebook, so it is not created here.
VAX_DATA_DIR = Path.cwd().joinpath("..", "..", "..", "data", "unvax_data", "output")

# Make sure the click-report folders exist (no-op when they already do).
for data_dir in (INPUT_DATA_DIR, OUTPUT_DATA_DIR):
    data_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load every daily click report and stamp each row with the report date,
# which is taken from the 8-digit YYYYMMDD run in the workbook's file name.
# Sorted because glob order is filesystem-dependent.
report_paths = sorted(INPUT_DATA_DIR.glob("*.xlsx"))
if not report_paths:
    raise FileNotFoundError(f"No .xlsx click reports found in {INPUT_DATA_DIR}")

dfs = []
for filename in report_paths:
    # Check the name before paying for the Excel read, and fail with the
    # offending file name rather than an AttributeError on a None match.
    date_match = re.search(r"(\d{8})", filename.stem)
    if date_match is None:
        raise ValueError(
            f"Cannot find a YYYYMMDD date in click report name {filename.name!r}"
        )
    report_df = pd.read_excel(filename, sheet_name="Dataset1")
    report_df["date_clicked"] = pd.to_datetime(date_match.group(1), format="%Y%m%d")
    dfs.append(report_df)

# ignore_index: each workbook carries its own row labels, so keeping them
# would leave duplicate index values in the combined frame.
df = pd.concat(dfs, ignore_index=True)

# Keep only clicks dated before 2021-06-15; that day was removed from the study.
# Explicit copy: later cells add columns to this filtered frame.
df = df[df["date_clicked"] < "2021-06-15"].copy()

  warn("Workbook contains no default style, apply openpyxl's default")


In [4]:
# Pull the short link out of the page path: the run of word characters
# enclosed by the first matching pair of slashes. Rows with no match get NaN.
df["short_page"] = df["Page"].str.extract(pat=r"/(\w*)/", expand=False)

In [5]:
# Translate each short link into the message it belongs to.
#
# Every block below lists short links in message order: the first name in a
# block is message_1, the second is message_2, and so on.

original_pages = """
myvaccine
my_vaccine
vaccinelocator
vaccine_locator
findavaccine
find_a_vaccine
find_vaccines
locatevaccine
"""

later_pages = """
myvaccine
vaccineinfo
vaccinelocator
infovaccine
findavaccine
vaccinehelp
helpvaccine
locatevaccine
"""

spanish_pages = """
mivacuna
vacunainfo
localizadordevacuna
infovacuna
encontrarunavacuna
vacunaayuda
ayudavacuna
localizarvacuna
"""

name_to_message = {}
for page_block in (original_pages, later_pages, spanish_pages):
    short_names = page_block.strip().split()
    block_mapping = dict(
        zip(short_names, (f"message_{n}" for n in range(1, len(short_names) + 1)))
    )

    # A short link that appears in more than one block must resolve to the
    # same message every time; check the whole block before merging it.
    for shared_name in block_mapping.keys() & name_to_message.keys():
        assert name_to_message[shared_name] == block_mapping[shared_name]

    name_to_message.update(block_mapping)

# Short links that are not listed above map to NaN.
df["message"] = df["short_page"].map(name_to_message)

In [6]:
# Total sessions (clicks) per message, as a Series named "num_clicks".
click_counts = (
    df.groupby("message")["Sessions"]
    .sum()
    .rename("num_clicks")
)

In [7]:
# Load the main per-group counts.
vax_df = pd.read_csv(VAX_DATA_DIR / "final_counts_by_group.csv")

# Keep a row only when it is both
#   - sent before 2021-06-15 (the last day of the study was dropped), and
#   - not in the control group (message_0).
# NOTE(review): date_sent is not parsed as a date on load, so this is presumably
# a string comparison that relies on ISO-formatted dates; confirm.
vax_df = vax_df[
    (vax_df["date_sent"] < "2021-06-15")
    & (vax_df["assigned_message"] != "message_0")
]

In [8]:
def _total_by_message(frame, label):
    """Sum ``count`` per ``assigned_message`` and name the resulting Series ``label``."""
    return frame.groupby("assigned_message")["count"].sum().rename(label)


# Everyone assigned to each message.
assigned_to_be_sent = _total_by_message(vax_df, "num_messages")

# Same total, restricted to rows where did_not_get_vaccinated is false.
got_vaccinated = _total_by_message(
    vax_df[~vax_df["did_not_get_vaccinated"]], "num_vaccinated"
)

In [9]:
# Put the three per-message Series side by side, aligned on the message label.
final_df = pd.concat(
    [click_counts, assigned_to_be_sent, got_vaccinated],
    axis="columns",
)

In [10]:
# Rates per message sent. Despite the "pct_" prefix these are plain ratios
# (not multiplied by 100).
final_df["pct_clicks"] = final_df["num_clicks"].div(final_df["num_messages"])
final_df["pct_vaccinated"] = final_df["num_vaccinated"].div(final_df["num_messages"])

In [11]:
# Show the per-message totals (clicks, messages, vaccinated) and the two derived rates.
final_df

Unnamed: 0,num_clicks,num_messages,num_vaccinated,pct_clicks,pct_vaccinated
message_1,365,10491,1158,0.034792,0.11038
message_2,224,12440,1443,0.018006,0.115997
message_3,277,11962,1358,0.023157,0.113526
message_4,201,10110,1133,0.019881,0.112067
message_5,309,15243,1786,0.020272,0.117169
message_6,1026,47058,4146,0.021803,0.088104
message_7,261,12363,1375,0.021111,0.111219
message_8,207,11434,1290,0.018104,0.112821


In [12]:
# Linear regression of vaccination rate (y) on click rate (x),
# with one observation per message.
st.linregress(
    x=final_df["pct_clicks"],
    y=final_df["pct_vaccinated"],
)

LinregressResult(slope=-0.19445288361547405, intercept=0.11446576556312202, rvalue=-0.11413109017204728, pvalue=0.7878552700422851, stderr=0.6910153166873546, intercept_stderr=0.015692929963467852)