In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import thinkstats2
from statsmodels.stats.power import TTestPower

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size for every plot drawn later in the notebook.
plt.rcParams.update({"figure.figsize": (8, 8)})

# Covid Death Rates in Murica

Use this data to attempt the analysis below. 

In [None]:
# Read the COVID death-rate table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/covid_deaths.csv")
df.head(n=5)

In [None]:
# Draw the death rate over time for three cohorts on one shared set of axes.
# Each series carries an explicit label so the legend tells the lines apart,
# and the y-axis gets a neutral label; without it the axis would show only the
# column name of whichever series happened to be drawn last.
# NOTE(review): `ci` is deprecated in newer seaborn releases in favour of
# `errorbar`; it is kept here because the installed version is not visible.
fig, ax = plt.subplots()
for column in ("unvaccinated", "fully_vaccinated", "one_booster"):
    sns.lineplot(data=df, x="Day", y=column, ci=0, label=column, ax=ax)
ax.set(ylabel="Death rate", title="Death rate by day and vaccination status")
ax.legend();

## Is there a significant difference in death rates between Fully Vaccinated people and Unvaccinated people?

#### Create Datasets

In [None]:
# The two samples under comparison, pulled from the like-named columns.
fv, un = df["fully_vaccinated"], df["unvaccinated"]

#### Explore Datasets

In [None]:
# Overlay the kernel density estimates of the two samples.
# Labels plus an explicit legend make the curves distinguishable, and a shared
# x-axis label replaces the column name left behind by the last-drawn series.
fig, ax = plt.subplots()
sns.kdeplot(fv, label="fully_vaccinated", ax=ax)
sns.kdeplot(un, label="unvaccinated", ax=ax)
ax.set(xlabel="Death rate", title="Death-rate distribution: fully vaccinated vs. unvaccinated")
ax.legend();

In [None]:
# Summary statistics of the fully vaccinated series.
fv.describe()

In [None]:
# Summary statistics of the unvaccinated series, for comparison with the cell above.
un.describe()

### Hypothesis Test

First I'll try a regular t-test, then I'll do the non-parametric version, the Mann-Whitney. 

In [None]:
# t-test
# Independent two-sample t-test with scipy's default settings.
ss.ttest_ind(a=fv, b=un)

In [None]:
# mw
# Mann-Whitney U test: the rank-based, non-parametric counterpart of the t-test.
ss.mannwhitneyu(x=fv, y=un)

### Calculate Power

In [None]:
# Inputs for the power calculation: significance level, effect size between
# the two samples, and the combined number of non-null observations.
# NOTE(review): the test above is ss.ttest_ind (two independent samples), but
# these inputs feed TTestPower, which statsmodels documents for one-sample /
# paired tests, and nobs here is the SUM of both samples. Confirm whether
# TTestIndPower with a per-group nobs1 is what is intended.
alpha = 0.05
ces = thinkstats2.CohenEffectSize(un, fv)
nobs = sum(sample.count() for sample in (fv, un))

In [None]:
# Power of the unvaccinated vs. fully vaccinated comparison, shown next to the
# effect size it was computed from.
# NOTE(review): the name `pow` shadows Python's builtin pow() for the rest of
# the kernel session; it is kept so any cell relying on it keeps working.
powerTest = TTestPower()
pow = powerTest.power(
    effect_size=ces,
    alpha=alpha,
    nobs=nobs,
)
(pow, ces)

### Result

Looks like a statistically significant difference, by every metric. 

## Is There a Significant Difference in Death Rates Between Fully Vaccinated and Boosted Populations?*

*The boosted population has two groups: those with one booster and those with two. The one-booster group is larger than the two-booster group. Use the ratio below to generate a weighted average rate, i.e. if the ratio is .2, then 20% of the total boosted population has two boosters and 80% has one booster. 

In [None]:
# Share of the boosted population that has two boosters (the rest have one).
two_dose_ratio = 0.2

In [None]:
# Blend the one-booster and two-booster rates into a single "boosted" rate,
# weighting each by its share of the boosted population.
b1, b2 = df["one_booster"], df["two_boosters"]
boost = b1.mul(1 - two_dose_ratio).add(b2.mul(two_dose_ratio))
boost

#### Explore

In [None]:
# Overlay the density of the weighted boosted rate with the fully vaccinated
# rate. Labels and a legend are required here: the two curves are otherwise
# indistinguishable, and only one of them contributes an axis label.
fig, ax = plt.subplots()
sns.kdeplot(boost, label="boosted (weighted)", ax=ax)
sns.kdeplot(fv, label="fully_vaccinated", ax=ax)
ax.set(xlabel="Death rate", title="Death-rate distribution: boosted vs. fully vaccinated")
ax.legend();

#### Test

In [None]:
# Same pair of tests as before, now fully vaccinated vs. weighted boosted.
# t-test
print(ss.ttest_ind(a=fv, b=boost))
# mw
print(ss.mannwhitneyu(x=fv, y=boost))

#### Power

In [None]:
# Power of the fully vaccinated vs. boosted comparison.
# Calculate inputs: effect size between the samples, significance level, and
# the combined number of non-null observations.
# NOTE(review): TTestPower is documented for one-sample / paired tests while
# the test above is ss.ttest_ind; confirm this is the intended power model.
ces2 = thinkstats2.CohenEffectSize(fv, boost)
alpha2 = .05
nobs2 = fv.count() + boost.count()
# calculate power
powerTest2 = TTestPower()
# Call the analyzer created in THIS cell. The previous version called
# `powerTest`, a leftover from the earlier section, so the cell only worked
# if that earlier cell had already been run.
pow2 = powerTest2.power(effect_size=ces2, nobs=nobs2, alpha=alpha2)
pow2, ces2

#### How Large of a Sample is Needed for a Power of .8?

In [None]:
# Solve for the number of observations at which the fully vaccinated vs.
# boosted comparison reaches a power of .8, and show it beside the sample
# size actually available.
# The significance level is taken from alpha2 (set with the power inputs)
# rather than repeated as a literal, so the two can never drift apart.
need_samp = powerTest2.solve_power(effect_size=ces2, alpha=alpha2, power=.8)
need_samp, nobs2

#### More Sophisticated Rates

We can improve the approximation of the number of people with two boosters vs one booster. For this part, try to consider the following, and put this into your data:
<ul>
<li> There is a starting two-booster ratio (as above: the proportion of boosted people who have two boosters). Try this being <b>.1</b>
<li> There is a final two-booster ratio. Try this being <b>.3</b>
<li> Each day, the proportion of boosted people who have two boosters increases by a fixed amount, starting at the first value (.1) on day 1 and ending at the last value (.3) on the last day. 
</ul>

<b>Note:</b> This is more of a challenge of manipulating the data than of the hypothesis testing. Once the datasets are established, the process is the same as above. The function "np.arange" may be useful here, but there are probably lots of ways to do it. 

In [None]:
# Two-booster ratios: one value per row of df, rising linearly from `start`
# on the first row to `cap` on the last row.
# np.linspace is used instead of np.arange with a float step because it
#   (a) includes the end value, so the last row really gets `cap`, and
#   (b) always returns exactly len(df) values. With a float step, arange's
#       length can be off by one through rounding, which would make the
#       later column assignment fail with a length mismatch.
# NOTE(review): this yields one ratio per ROW; it matches "one ratio per day"
# only if df has a single row per day. Confirm against the data.
start = .1
cap = .3
# `increment` is kept as the actual spacing between consecutive ratios.
x, increment = np.linspace(start, cap, num=len(df), retstep=True)
x

In [None]:
# Add ratio to df
# Store the per-row two-booster ratio as a new column, then preview the frame.
df["Ratio"] = x
df.head(n=5)

In [None]:
# Generate Weighted Boosted Rate

In [None]:
# Weighted boosted rate per row: the one-booster rate weighted by the share of
# boosted people with one booster, plus the two-booster rate weighted by the
# share with two (the row's "Ratio").
df["WeightBoost"] = (
    df["one_booster"] * (1 - df["Ratio"])
    + df["two_boosters"] * df["Ratio"]
)
# Preview a random handful of rows. The sample is seeded so the output is the
# same on every Restart & Run All, and capped at len(df) so a frame shorter
# than 10 rows does not raise.
df.sample(n=min(10, len(df)), random_state=42)

In [None]:
# Shorthand for the weighted boosted rate column, used by the tests below.
wboost = df["WeightBoost"]

### Do Test

In [None]:
# Same pair of tests, now fully vaccinated vs. the time-varying weighted boosted rate.
# t-test
print(ss.ttest_ind(a=fv, b=wboost))
# mw
print(ss.mannwhitneyu(x=fv, y=wboost))

#### Power

In [None]:
# Power of the fully vaccinated vs. time-varying weighted boosted comparison.
# Calculate inputs: effect size between the samples, significance level, and
# the combined number of non-null observations.
# NOTE(review): TTestPower is documented for one-sample / paired tests while
# the test above is ss.ttest_ind; confirm this is the intended power model.
ces3 = thinkstats2.CohenEffectSize(fv, wboost)
alpha3 = .05
nobs3 = fv.count() + wboost.count()
# calculate power
powerTest3 = TTestPower()
# Call the analyzer created in THIS cell. The previous version called
# `powerTest`, a leftover from the first power section, so the cell only
# worked if that earlier cell had already been run.
pow3 = powerTest3.power(effect_size=ces3, nobs=nobs3, alpha=alpha3)
pow3, ces3

## ANOVA

Is there a significant difference between any of the 3 vaccinated groups? Can you test it? 

In [None]:
# First, check whether the three vaccinated groups have roughly equal
# variances (Levene's test); show the p-value.
varStat, varP = ss.levene(
    *(df[col] for col in ("fully_vaccinated", "one_booster", "two_boosters"))
)
varP

In [None]:
# Variance of each vaccinated group, in the same order as the Levene test above.
tuple(df[col].var() for col in ("fully_vaccinated", "one_booster", "two_boosters"))

It appears that the two-boosters group has a radically different variance from the other two groups, which will make our ANOVA not really reliable. 