# COVID-19 EDA and Hypothesis Testing

### Data is sourced from:

https://github.com/nytimes/covid-19-data

### Other sources:

https://covidtracking.com/data/state/california/
https://covidtracking.com/api
https://covid19.healthdata.org/projections

In [1]:
import math

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import scipy.stats as stats
import random
from datetime import datetime,date

In [2]:
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures created below.
plt.style.use('fivethirtyeight')

## Explore and clean the data

#### Load the data into a Pandas DataFrame

In [3]:
# Read the state-level CSV and keep a private copy of the three columns
# this analysis works with.
us_covid = pd.read_csv('../datasets/covid-19-data/us-states.csv')
us_covid_cases = us_covid.loc[:, ['date', 'state', 'cases']].copy()

# Distinct state names and date strings present in the file.
states = us_covid['state'].unique()
dates = us_covid['date'].unique()

#### Use a pivot table to move the data into columns/rows.

In [4]:
# One row per state, one column per date; a state with no entry for a date
# gets a count of 0 instead of NaN.
us_covid_cases_table = (
    us_covid_cases
    .pivot(index='state', columns='date', values='cases')
    .fillna(0)
)

#### Convert column names to date time objects.

In [5]:
# Parse the date-string column labels into datetimes so that later cells can
# look up "the previous day" with date arithmetic.
us_covid_cases_table.columns = pd.to_datetime(us_covid_cases_table.columns)

#### Display table for number of cases

In [6]:
# Preview the first five states of the case-count table.
us_covid_cases_table.head()

date,2020-01-21,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,...,2020-03-27,2020-03-28,2020-03-29,2020-03-30,2020-03-31,2020-04-01,2020-04-02,2020-04-03,2020-04-04,2020-04-05
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,639.0,720.0,830.0,947.0,999.0,1106.0,1270.0,1535.0,1633.0,1841.0
Alaska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,85.0,102.0,114.0,119.0,133.0,143.0,146.0,156.0,169.0,185.0
Arizona,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,665.0,773.0,929.0,1169.0,1298.0,1413.0,1600.0,1769.0,2019.0,2269.0
Arkansas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,386.0,409.0,449.0,508.0,564.0,624.0,683.0,738.0,743.0,853.0
California,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,...,4914.0,5565.0,6266.0,7421.0,8582.0,9816.0,10995.0,12569.0,13796.0,15076.0


## For this study, I'm only considering percent increases once a state has at least 10 cases.

That way, I am getting rid of the bias of those first couple of days (e.g., when a state goes from 1 case to 2 cases, it's a 100% increase).

In [7]:
# Zero out every count below 10 (a count of exactly 10 is kept) so the first
# few days of a state's outbreak do not produce huge percent jumps.
below_threshold = us_covid_cases_table < 10
us_covid_cases_table[below_threshold] = 0

#### Create an empty table for the daily percent increases: percent = (that day's count - previous day's count) / previous day's count * 100

In [8]:
# Start from an all-zero frame with the same states (rows) and dates (columns)
# as the case-count table.  Passing the scalar 0 creates integer zeros
# directly; the previous approach built an all-NaN object-dtype frame and
# relied on fillna() silently downcasting it, which newer pandas deprecates.
# `.values` is used for the labels, so the new index carries no name.
us_covid_increases = pd.DataFrame(0,
                                  index=us_covid_cases_table.index.values,
                                  columns=us_covid_cases_table.columns.values)


#### Fill empty array with the daily increases from the us_covid_cases_table data

In [9]:
# The first date has no previous day to compare with, so its column is skipped
# and keeps its initial value.
for each_date in us_covid_increases.columns[1:]:
    previous_date = each_date - pd.Timedelta(days=1)
    cases_today = us_covid_cases_table[each_date]
    cases_yesterday = us_covid_cases_table[previous_date]
    # Percent change relative to the previous day.  A previous-day count of 0
    # yields NaN (0/0) or inf (n/0).
    us_covid_increases[each_date] = ((cases_today - cases_yesterday) / cases_yesterday) * 100

#### Display table for percent increases

In [10]:
#us_covid_increases

#### Change 'NaN' and 'inf' values to 0 

NaN occurs when there were no recorded cases on that day and the day before (e.g., 0/0)

Inf occurs when there was a case reported on that day but none on the day before (e.g., 1/0)

In [11]:
# NaN (0/0) and +inf (n/0) both come from a zero count on the previous day;
# record them as a 0 percent increase.
us_covid_increases = us_covid_increases.fillna(0).replace(np.inf, 0)

In [12]:
# Preview the first five states of the percent-increase table.
us_covid_increases.head()

Unnamed: 0,2020-01-21,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,...,2020-03-27,2020-03-28,2020-03-29,2020-03-30,2020-03-31,2020-04-01,2020-04-02,2020-04-03,2020-04-04,2020-04-05
Alabama,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.773234,12.676056,15.277778,14.096386,5.491024,10.710711,14.82821,20.866142,6.384365,12.737293
Alaska,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.188406,20.0,11.764706,4.385965,11.764706,7.518797,2.097902,6.849315,8.333333,9.467456
Arizona,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.905512,16.240602,20.181113,25.83423,11.035073,8.859784,13.234253,10.5625,14.132278,12.382368
Arkansas,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.97151,5.958549,9.779951,13.140312,11.023622,10.638298,9.455128,8.052709,0.677507,14.804845
California,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21.034483,13.247863,12.596586,18.432812,15.644792,14.378933,12.011002,14.315598,9.762113,9.278052


#### Reset the index for the table to be named 'state'

In [13]:
# The frame was built from raw index values, so its index has no name yet.
# Naming it 'state' lets the later merge refer to it with on='state'.
us_covid_increases.index.name = 'state'

In [14]:
# Preview again to confirm the index is now labelled 'state'.
us_covid_increases.head()

Unnamed: 0_level_0,2020-01-21,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,...,2020-03-27,2020-03-28,2020-03-29,2020-03-30,2020-03-31,2020-04-01,2020-04-02,2020-04-03,2020-04-04,2020-04-05
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.773234,12.676056,15.277778,14.096386,5.491024,10.710711,14.82821,20.866142,6.384365,12.737293
Alaska,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.188406,20.0,11.764706,4.385965,11.764706,7.518797,2.097902,6.849315,8.333333,9.467456
Arizona,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.905512,16.240602,20.181113,25.83423,11.035073,8.859784,13.234253,10.5625,14.132278,12.382368
Arkansas,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.97151,5.958549,9.779951,13.140312,11.023622,10.638298,9.455128,8.052709,0.677507,14.804845
California,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21.034483,13.247863,12.596586,18.432812,15.644792,14.378933,12.011002,14.315598,9.762113,9.278052


#### Make the columns for dates be datetime objects.

In [16]:
# Check the type of the first column label as exposed by the underlying
# numpy array (`.values`).
type(us_covid_increases.columns.values[0])

numpy.datetime64

In [17]:
# Check the type of the same label after passing it through pd.to_datetime.
type(pd.to_datetime(us_covid_increases.columns.values[0]))

pandas._libs.tslibs.timestamps.Timestamp

In [24]:
# Display the raw array of column labels.
us_covid_increases.columns.values


array(['2020-01-21T00:00:00.000000000', '2020-01-22T00:00:00.000000000',
       '2020-01-23T00:00:00.000000000', '2020-01-24T00:00:00.000000000',
       '2020-01-25T00:00:00.000000000', '2020-01-26T00:00:00.000000000',
       '2020-01-27T00:00:00.000000000', '2020-01-28T00:00:00.000000000',
       '2020-01-29T00:00:00.000000000', '2020-01-30T00:00:00.000000000',
       '2020-01-31T00:00:00.000000000', '2020-02-01T00:00:00.000000000',
       '2020-02-02T00:00:00.000000000', '2020-02-03T00:00:00.000000000',
       '2020-02-04T00:00:00.000000000', '2020-02-05T00:00:00.000000000',
       '2020-02-06T00:00:00.000000000', '2020-02-07T00:00:00.000000000',
       '2020-02-08T00:00:00.000000000', '2020-02-09T00:00:00.000000000',
       '2020-02-10T00:00:00.000000000', '2020-02-11T00:00:00.000000000',
       '2020-02-12T00:00:00.000000000', '2020-02-13T00:00:00.000000000',
       '2020-02-14T00:00:00.000000000', '2020-02-15T00:00:00.000000000',
       '2020-02-16T00:00:00.000000000', '2020-02-17

In [21]:
# Re-check the type of the first column label in the underlying numpy array.
type(us_covid_increases.columns.values[0])

numpy.datetime64

## Load in the dates for stay-at-home orders

Dates were scraped from https://www.kff.org/coronavirus-policy-watch/stay-at-home-orders-to-fight-covid19/

In [None]:
# Load the stay-at-home order dates from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
stay_home_dates = pd.read_pickle('../datasets/stay_home_orders_pickled.pkl')

In [None]:
# Preview the first five rows of the stay-at-home order data.
stay_home_dates.head()

## Merge the percent increases and the stay-at-home order date dataframes

In [None]:
# Join the percent increases with the stay-at-home dates on 'state' (the index
# name of the left frame).  With the default join type only states present in
# both frames are kept.
# NOTE(review): assumes stay_home_dates exposes 'state' as a column or index
# level; confirm against the pickle.
merged = us_covid_increases.merge(stay_home_dates, on='state')

In [None]:
# Preview the first five rows of the merged table.
merged.head()

In [None]:
# Check what type the first column label has after the merge.
type(merged.columns.values[0])

In [None]:
# Display the first column label itself.
merged.columns.values[0]

In [None]:
# Turn the timestamp column labels into plain `datetime.date` objects.
# The labels have to be assigned through `merged.columns`, because
# `Index.values` is read-only.  Only Timestamp labels are converted; any other
# label contributed by the stay-at-home frame is left unchanged.
merged.columns = [
    label.date() if isinstance(label, pd.Timestamp) else label
    for label in merged.columns
]

## Plot the Percent Increases Over Time

In [None]:
def plot_percent_changes(ax, data, dates, label):
    '''Draw the strictly positive entries of `data` against their dates.

    ax:    object whose `plot` method receives the points.
    data:  values to plot; entries that are not greater than 0 are dropped.
    dates: x values, position-aligned with `data`.
    label: legend label passed through to `ax.plot`.
    '''
    positive = data > 0  # one mask, applied to both the values and the dates
    y_values = data[positive]
    x_values = dates[positive]
    ax.plot(x_values, y_values, label=label)

In [None]:
def generate_labels(ax):
    '''Add the legend, x tick spacing and axis labels to a percent-change plot.

    ax: the axes to decorate.  Its owning figure is read from `ax.figure`, so
        the function no longer depends on a global variable named `fig` and
        works for any axes passed in.
    '''
    ax.legend(fontsize=20)
    # Let the figure rotate and align the date tick labels.
    ax.figure.autofmt_xdate()
    # Place an x tick every 4 units across the current x range.
    # NOTE(review): on a matplotlib date axis one unit should be one day,
    # i.e. a tick every fourth day; confirm.
    start, end = ax.get_xlim()
    ax.xaxis.set_ticks(np.arange(start, end, 4))
    ax.set_ylabel('Percent Change Per Day')
    ax.yaxis.label.set_size(20)
    ax.set_xlabel('Date')
    ax.xaxis.label.set_size(20)

In [None]:
# Compare the daily percent increases of Washington and New York on one axes.
fig, ax = plt.subplots(figsize=(16,14))
for state_name in ('Washington', 'New York'):
    plot_percent_changes(ax, us_covid_increases.loc[state_name, :],
                         us_covid_cases_table.columns.values, state_name)
generate_labels(ax)

In [None]:
# Same Washington vs. New York comparison as the preceding cell, drawn on a
# fresh figure.
fig, ax = plt.subplots(figsize=(16,14))
date_labels = us_covid_cases_table.columns.values
for state_name in ('Washington', 'New York'):
    state_increases = us_covid_increases.loc[state_name, :]
    plot_percent_changes(ax, state_increases, date_labels, state_name)
generate_labels(ax)