In [165]:
import pandas as pd
import glob
import numpy as np

# new files since april-12

path = '../../corona/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv'

files = glob.glob(path)

us_states = pd.read_csv('us_states.csv',sep=';',header=None,usecols=[0])
us_states_df = pd.DataFrame()

for f in files:
    date = f.split('/')[-1].split('.')[0]
    
    df = pd.read_csv(f,sep=',',usecols=[0,2,5,6])
    df['Last_Update'] = (pd.to_datetime(df['Last_Update']))
    df['date'] = pd.to_datetime([date] * len(df))

    df.rename(columns={'Province_State' : 'state',
                      'Confirmed' : 'confirmed',
                      'Deaths' : 'deceased'},inplace=True)
    df = pd.merge(df,us_states,left_on='state',right_on=0)
    df.drop(0,axis=1,inplace=True)
    df.set_index('date',inplace=True)

    
    us_states_df = pd.concat([us_states_df,df])
    
us_states_df

Unnamed: 0_level_0,state,Last_Update,confirmed,deceased
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-12,Alabama,2020-04-12 23:18:15,3563,93
2020-04-12,Alaska,2020-04-12 23:18:15,272,8
2020-04-12,Arizona,2020-04-12 23:18:15,3542,115
2020-04-12,Arkansas,2020-04-12 23:18:15,1280,27
2020-04-12,California,2020-04-12 23:18:15,22795,640
...,...,...,...,...
2020-04-26,Virginia,2020-04-27 02:32:46,12970,449
2020-04-26,Washington,2020-04-27 02:32:46,13521,749
2020-04-26,West Virginia,2020-04-27 02:32:46,1055,34
2020-04-26,Wisconsin,2020-04-27 02:32:46,5912,272


In [166]:
state_names = us_states_df['state'].unique()

state_dic = dict()

def process_state(state_name):
    
    state = (us_states_df[us_states_df['state'] == state_name]).copy()
    state['inc'] = state['confirmed'] - state['confirmed'].shift()
    state['dead_inc'] = state['deceased'] - state['deceased'].shift()
    state['factor'] = state['confirmed'] / state['confirmed'].shift()
    state['dead_factor'] = state['deceased'] / state['deceased'].shift()
    state_dic[state_name] = state


In [167]:
for s in state_names:
    process_state(s)
    
state_dic['New York']

Unnamed: 0_level_0,state,Last_Update,confirmed,deceased,inc,dead_inc,factor,dead_factor
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-12,New York,2020-04-12 23:18:15,189033,9385,,,,
2020-04-13,New York,2020-04-13 23:07:54,195749,10058,6716.0,673.0,1.035528,1.07171
2020-04-14,New York,2020-04-14 23:33:31,203020,10842,7271.0,784.0,1.037145,1.077948
2020-04-15,New York,2020-04-15 22:56:51,214454,11617,11434.0,775.0,1.05632,1.071481
2020-04-16,New York,2020-04-16 23:30:51,223691,14832,9237.0,3215.0,1.043072,1.27675
2020-04-17,New York,2020-04-17 23:30:52,230597,17131,6906.0,2299.0,1.030873,1.155003
2020-04-18,New York,2020-04-18 22:32:47,241712,17671,11115.0,540.0,1.048201,1.031522
2020-04-19,New York,2020-04-19 23:41:01,247815,18298,6103.0,627.0,1.025249,1.035482
2020-04-20,New York,2020-04-20 23:36:47,253060,18611,5245.0,313.0,1.021165,1.017106
2020-04-21,New York,2020-04-21 23:40:34,258361,19104,5301.0,493.0,1.020948,1.02649


In [168]:
# old data from files before april 12
old_state_dic = pd.read_pickle('us_state_dict_old.pkl')



In [169]:
#https://worldpopulationreview.com/states/
us_state_density = pd.read_csv('us_states_pop_density.csv')
us_state_density.set_index('State',inplace=True)

us_state_economy = pd.read_csv('us_states_economy.csv',header=None,sep=';',
                               index_col=[1],thousands=',')

sqr_mile_factor = 2.58998811

us_state_density['density'] *= sqr_mile_factor 

us_state_economy.drop([0,2,3,5],axis=1,inplace=True)
us_state_economy.rename(columns={4:'gdp'},inplace=True)
us_state_economy.index.name='state'


In [173]:
joined_state_dic = dict()

for k,v in old_state_dic.items():
    new_state =  pd.concat([v,state_dic[k]])
    new_state.index = (pd.to_datetime(new_state.index)).date
    new_state = new_state[['confirmed','deceased','inc','dead_inc',
                          'factor','dead_factor']]
    
    new_state['inc'] = new_state['confirmed'] - new_state['confirmed'].shift()
    new_state['dead_inc'] = new_state['deceased'] - new_state['deceased'].shift()
    new_state['factor'] = new_state['confirmed'] / new_state['confirmed'].shift()
    new_state['dead_factor'] = new_state['deceased'] / new_state['deceased'].shift()
    new_state['density'] = us_state_density.loc[k,'density']
    new_state['gdp'] = us_state_economy.loc[k,'gdp']
    new_state['pct_dead'] = 100 * new_state['deceased'] / new_state['confirmed']
    new_state = new_state[new_state['deceased'] > 1]
    new_state.replace(np.inf,np.nan,inplace=True)
    
    joined_state_dic[k] = new_state



In [174]:
joined_state_dic['New York']

Unnamed: 0,confirmed,deceased,inc,dead_inc,factor,dead_factor,density,gdp,pct_dead
2020-03-14,525.0,2.0,104.0,2.0,1.247031,,1068.424744,89076,0.380952
2020-03-15,732.0,3.0,207.0,1.0,1.394286,1.5,1068.424744,89076,0.409836
2020-03-16,967.0,10.0,235.0,7.0,1.321038,3.333333,1068.424744,89076,1.034126
2020-03-17,1706.0,13.0,739.0,3.0,1.764219,1.3,1068.424744,89076,0.762016
2020-03-18,2495.0,16.0,789.0,3.0,1.462485,1.230769,1068.424744,89076,0.641283
2020-03-19,5365.0,34.0,2870.0,18.0,2.150301,2.125,1068.424744,89076,0.633737
2020-03-20,8310.0,42.0,2945.0,8.0,1.548928,1.235294,1068.424744,89076,0.505415
2020-03-21,11710.0,60.0,3400.0,18.0,1.409146,1.428571,1068.424744,89076,0.512383
2020-03-22,15800.0,117.0,4090.0,57.0,1.349274,1.95,1068.424744,89076,0.740506
2020-03-23,20884.0,158.0,5084.0,41.0,1.321772,1.350427,1068.424744,89076,0.75656


In [175]:
import pickle

with open ('us_state_dict.pkl','wb') as f:
    pickle.dump(joined_state_dic,f,pickle.HIGHEST_PROTOCOL)
    