# Correlation between COVID daily cases and vaccination percentage

In [1]:
from scipy import stats
import pandas as pd
import altair as alt
import requests

In [2]:
# Per-state rolling-average case/death figures, read straight from the
# New York Times covid-19-data repository on GitHub. The `date` column is
# parsed to datetimes at load time so it can be sorted chronologically later.
NYT_STATE_ROLLING_AVG_URL = (
    'https://raw.githubusercontent.com/nytimes/covid-19-data/'
    'master/rolling-averages/us-states.csv'
)
state_df = pd.read_csv(NYT_STATE_ROLLING_AVG_URL, parse_dates=['date'])

In [3]:
# Keep only the most recent row for every state: order the rows by state and
# then by date (ascending, the default), and take the final row of each group.
last_moving_average_df = (
    state_df
    .sort_values(['state', 'date'])
    .groupby('state')
    .tail(1)
)

In [4]:
# Download the vaccination feed from the CDC COVID Data Tracker. The decoded
# JSON is kept in `json_data`; the next cell reads its 'vaccination_data' key.
r = requests.get(
    'https://covid.cdc.gov/covid-data-tracker/COVIDData/getAjaxData?id=vaccination_data',
    # requests applies no timeout by default, so a stalled server would
    # otherwise block this cell (and Run All) indefinitely.
    timeout=30,
)
# Fail here on an HTTP error status rather than later with a confusing
# JSON decode error or a missing-key error.
r.raise_for_status()
json_data = r.json()

In [5]:
# Build a DataFrame from the records stored under the payload's
# 'vaccination_data' key.
vaccination_df = pd.DataFrame(data=json_data['vaccination_data'])

In [6]:
# Preview the first five records, transposed so that each field is a row
# (the frame is wide), rendered through the pandas Styler.
vaccination_df.head().transpose().style

Unnamed: 0,0,1,2,3,4
Date,2021-06-20,2021-06-20,2021-06-20,2021-06-20,2021-06-20
Location,US,AK,AL,AR,AS
ShortName,USA,AKA,ALA,ARA,ASA
LongName,United States,Alaska,Alabama,Arkansas,American Samoa
Census2019,331996199.000000,731545.000000,4903185.000000,3017804.000000,55689.000000
date_type,Report,Report,Report,Report,Report
Doses_Distributed,379003410.000000,825625.000000,4758560.000000,2817700.000000,54030.000000
Doses_Administered,317966408,639941,3282193,2185888,45025
Dist_Per_100K,114159.000000,112860.000000,97050.000000,93369.000000,97021.000000
Admin_Per_100K,95774.000000,87478.000000,66940.000000,72433.000000,80851.000000


In [7]:
# Join each state's latest rolling-average case/death rates with the CDC
# vaccination figures. The CDC frame first gets `state` and `population`
# columns (copies of LongName and Census2019) so both sides share the
# `state` join key, and is then narrowed to the columns used downstream.
# NOTE(review): how='inner' keeps only names present on both sides, so any
# state whose CDC LongName is spelled differently from the NYT name is
# silently dropped -- worth confirming the resulting row count.
vaccine_covid_df = (
    last_moving_average_df[['state', 'cases_avg_per_100k', 'deaths_avg_per_100k']]
    .merge(
        vaccination_df
        .assign(
            state=vaccination_df['LongName'],
            population=vaccination_df['Census2019'],
        )
        .loc[:, [
            'state',
            'population',
            'Admin_Per_100K',
            'Admin_Per_100k_18Plus',
            'Series_Complete_Pop_Pct',
            'Series_Complete_12PlusPop_Pct',
        ]],
        how='inner',
        on='state',
    )
)

In [8]:
# Show the first five merged rows as a quick sanity check of the join.
vaccine_covid_df.head(n=5)

Unnamed: 0,state,cases_avg_per_100k,deaths_avg_per_100k,population,Admin_Per_100K,Admin_Per_100k_18Plus,Series_Complete_Pop_Pct,Series_Complete_12PlusPop_Pct
0,Alabama,3.89,0.12,4903185.0,66940.0,84322.0,31.9,37.3
1,Alaska,4.3,0.0,731545.0,87478.0,110277.0,41.7,50.0
2,Arizona,5.77,0.15,7278717.0,87201.0,107879.0,39.0,45.7
3,Arkansas,7.57,0.09,3017804.0,72433.0,91373.0,33.3,39.3
4,California,2.27,0.06,39512223.0,104977.0,128122.0,48.1,56.4


In [9]:
# Spearman rank correlation between the completed-series percentage and the
# average daily case rate; unpacks the coefficient and its p-value.
corr, p_val = stats.spearmanr(
    vaccine_covid_df.loc[:, 'Series_Complete_Pop_Pct'],
    vaccine_covid_df.loc[:, 'cases_avg_per_100k'],
)

In [10]:
# Scatter plot of each state's average daily case rate against its
# completed-vaccination percentage. Point size encodes population, fill
# encodes the case rate, and the multi-line title records the dates of both
# data sources together with the Spearman result computed in the cell above.
(
    alt.Chart(
        vaccine_covid_df,
        title=[
            'Correlation between cases per day and vaccination pct',
            f"Date for vaccination data {vaccination_df['Date'].tail(1).item()}",
            # Reduce the datetime to a plain date so the title does not show
            # a meaningless 00:00:00 time component.
            f"Date for COVID case data {last_moving_average_df['date'].dt.date.tail(1).item()}",
            # Report the coefficient as well as the p-value: the p-value alone
            # says nothing about the strength or direction of the relationship.
            f"Correlation test (Spearman) rho: {corr:0.3f}, p-value: {p_val:0.4f}",
        ],
    )
    .mark_point()
    .encode(
        # Explicit axis titles so the figure stands alone without knowledge
        # of the raw column names.
        x=alt.X(
            'Series_Complete_Pop_Pct:Q',
            title='Population with completed vaccination series (%)',
        ),
        y=alt.Y(
            'cases_avg_per_100k:Q',
            title='Average daily cases per 100k',
        ),
        # Encoding types are spelled out (matching x/y) instead of relying
        # on dtype inference from the DataFrame.
        size='population:Q',
        fill='cases_avg_per_100k:Q',
        tooltip='state:N',
    )
)