In [None]:
import glob
import os

import numpy as np
import pandas as pd

# new files since april-12

path = '../../corona/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv'

files = glob.glob(path)

us_states = pd.read_csv('us_states.csv',sep=';',header=None,usecols=[0])
us_states_df = pd.DataFrame()

for f in files:
    date = f.split('/')[-1].split('.')[0]
    
    df = pd.read_csv(f,sep=',',usecols=[0,2,5,6])
    df['Last_Update'] = (pd.to_datetime(df['Last_Update']))
    df['date'] = pd.to_datetime([date] * len(df))

    df.rename(columns={'Province_State' : 'state',
                      'Confirmed' : 'confirmed',
                      'Deaths' : 'deceased'},inplace=True)
    df = pd.merge(df,us_states,left_on='state',right_on=0)
    df.drop(0,axis=1,inplace=True)
    df.set_index('date',inplace=True)

    
    us_states_df = pd.concat([us_states_df,df])
    
us_states_df

In [None]:
state_names = us_states_df['state'].unique()

state_dic = dict()

def process_state(state_name):
    
    state = (us_states_df[us_states_df['state'] == state_name]).copy()
    state['inc'] = state['confirmed'] - state['confirmed'].shift()
    state['dead_inc'] = state['deceased'] - state['deceased'].shift()
    state['factor'] = state['confirmed'] / state['confirmed'].shift()
    state['dead_factor'] = state['deceased'] / state['deceased'].shift()
    state_dic[state_name] = state


In [None]:
for s in state_names:
    process_state(s)
    
state_dic['New York']

In [None]:
# Old data from files before April 12.
# Presumably a dict mapping state name -> DataFrame (it is iterated with
# .items() and concatenated with the per-state frames further down).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
old_state_dic = pd.read_pickle('us_state_dict_old.pkl')



In [None]:
#https://worldpopulationreview.com/states/
us_state_density = pd.read_csv('us_states_pop_density.csv')
us_state_density.set_index('State',inplace=True)

us_state_economy = pd.read_csv('us_states_economy.csv',header=None,sep=';',
                               index_col=[1],thousands=',')

sqr_mile_factor = 2.58998811

us_state_density['density'] *= sqr_mile_factor 

us_state_economy.drop([0,2,3,5],axis=1,inplace=True)
us_state_economy.rename(columns={4:'gdp'},inplace=True)
us_state_economy.index.name='state'


In [None]:
# Display the density table ('density' has already been scaled by
# sqr_mile_factor in the cell above).
us_state_density

In [None]:
joined_state_dic = dict()

for k,v in old_state_dic.items():
    new_state =  pd.concat([v,state_dic[k]])
    new_state.index = (pd.to_datetime(new_state.index)).date
    new_state = new_state[['confirmed','deceased','inc','dead_inc',
                          'factor','dead_factor']]
    
    new_state['inc'] = new_state['confirmed'] - new_state['confirmed'].shift()
    new_state['dead_inc'] = new_state['deceased'] - new_state['deceased'].shift()
    new_state['factor'] = new_state['confirmed'] / new_state['confirmed'].shift()
    new_state['dead_factor'] = new_state['deceased'] / new_state['deceased'].shift()
    new_state['density'] = us_state_density.loc[k,'density']
    new_state['gdp'] = us_state_economy.loc[k,'gdp']
    new_state['pct_dead'] = 100 * new_state['deceased'] / new_state['confirmed']
    new_state['conf_per_M'] = new_state['confirmed'] / (us_state_density.loc[k,'Pop'] / 1e6)
    new_state['dead_per_M'] = new_state['deceased'] / (us_state_density.loc[k,'Pop'] / 1e6)
    new_state = new_state[new_state['deceased'] > 1]
    new_state.replace(np.inf,np.nan,inplace=True)
    
    joined_state_dic[k] = new_state


In [None]:
# Spot-check the joined frame for one state.
joined_state_dic['New York']

In [None]:
import pickle

with open ('us_state_dict.pkl','wb') as f:
    pickle.dump(joined_state_dic,f,pickle.HIGHEST_PROTOCOL)
    