# Correlation between COVID daily cases and vaccination percentage

In [1]:
from scipy import stats
import pandas as pd
import altair as alt
import requests

In [2]:
# State-level rolling-average COVID figures, read from the New York Times'
# public GitHub repository; the 'date' column is parsed into datetimes.
NYT_STATE_ROLLING_AVERAGES_URL = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-states.csv'
state_df = pd.read_csv(NYT_STATE_ROLLING_AVERAGES_URL, parse_dates=['date'])

In [3]:
# Order the rows chronologically within each state, then keep only the final
# (i.e. most recent) row of every state.
state_rows_in_date_order = state_df.sort_values(['state', 'date'])
last_moving_average_df = state_rows_in_date_order.groupby('state').tail(1)

In [4]:
# Vaccination data is from the CDC COVID Data Tracker JSON endpoint.
# A timeout keeps the cell from hanging indefinitely if the server stalls,
# and raise_for_status() surfaces an HTTP error (4xx/5xx) right here instead
# of as a confusing JSON-decoding failure on an error page.
r = requests.get(
    'https://covid.cdc.gov/covid-data-tracker/COVIDData/getAjaxData?id=vaccination_data',
    timeout=30,
)
r.raise_for_status()
json_data = r.json()

In [5]:
# The response nests its records under the 'vaccination_data' key; turn those
# records into a DataFrame.
vaccination_records = json_data['vaccination_data']
vaccination_df = pd.DataFrame(vaccination_records)

In [6]:
# Join the latest per-state case/death averages with the CDC vaccination
# figures, matching rows on the state name.
# NOTE(review): how='inner' keeps only names present in both sources, so a
# jurisdiction spelled differently by the two providers is silently dropped —
# confirm the merged row count is what you expect.
latest_rates_by_state = last_moving_average_df[
    ['state', 'cases_avg_per_100k', 'deaths_avg_per_100k']
]

vaccination_columns = [
    'state',
    'population',
    'Admin_Per_100K',
    'Admin_Per_100k_18Plus',
    'Series_Complete_Pop_Pct',
    'Series_Complete_12PlusPop_Pct',
]
# Expose the CDC 'LongName' and 'Census2019' columns under the names used for
# the join key and the population before selecting the columns of interest.
vaccination_by_state = vaccination_df.assign(
    state=lambda frame: frame.LongName,
    population=lambda frame: frame.Census2019,
)[vaccination_columns]

vaccine_covid_df = latest_rates_by_state.merge(
    vaccination_by_state,
    on='state',
    how='inner',
)

In [7]:
# Spearman rank correlation between the vaccination percentage and the
# per-100k case average; returns the coefficient and its p-value.
vaccinated_pct = vaccine_covid_df['Series_Complete_Pop_Pct']
cases_per_100k = vaccine_covid_df['cases_avg_per_100k']
corr, p_val = stats.spearmanr(vaccinated_pct, cases_per_100k)

In [8]:
# Scatter plot of each state's latest per-100k case average against its
# vaccination percentage. Point size follows population and the fill colour
# repeats the case average. The multi-line title records the dates taken from
# both data sources together with the Spearman coefficient and its p-value, so
# the direction and strength of the association are visible, not only its
# significance.
(
    alt.Chart(
        vaccine_covid_df,
        title=[
            'Correlation between cases per day and vaccination pct',
            f"Date for vaccination data {vaccination_df['Date'].tail(1).item()}",
            f"Date for COVID case data {last_moving_average_df['date'].tail(1).item()}",
            f"Spearman correlation coefficient: {corr:0.4f}",
            f"Correlation test (Spearman) p-value: {p_val:0.4f}",
        ],
    )
    .mark_point()
    .encode(
        y='cases_avg_per_100k:Q',
        x=alt.X('Series_Complete_Pop_Pct:Q'),
        size='population',
        tooltip='state',
        fill='cases_avg_per_100k',
    )
)