In [22]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# State names: first ';'-separated field of a header-less file.
us_states = pd.read_csv(
    'us_states.csv',
    sep=';',
    header=None,
    usecols=[0],
    names=['state_name'],
)

# Per-state population table; source: https://worldpopulationreview.com/states/
us_state_data = pd.read_csv('us_states_pop_density.csv')

# Economy table: header-less, ';'-separated, indexed by its second field,
# with ',' parsed as a thousands separator.
us_state_economy = pd.read_csv(
    'us_states_economy.csv',
    header=None,
    sep=';',
    index_col=[1],
    thousands=',',
)

# Field 4 becomes 'gdp'; fields 0, 2, 3 and 5 are discarded.
us_state_economy = (
    us_state_economy
    .rename(columns={4: 'gdp'})
    .drop([0, 2, 3, 5], axis=1)
)

# 2.58998811 is the number of square kilometres in one square mile.
sqr_mile_factor = 2.58998811

# NOTE(review): if 'density' is people per square mile and the goal is people
# per square kilometre, the conversion should divide by this factor rather
# than multiply -- confirm the intended unit before relying on this column.
us_state_data['density'] = us_state_data['density'] * sqr_mile_factor
us_state_data = us_state_data.set_index('State')

def _read_us_state_rows(f, state_name, country_col, state_col, date_col):
    """Return the rows of one daily report that belong to a single US state.

    Parameters
    ----------
    f : path or file-like object accepted by ``pandas.read_csv``
    state_name : value to match exactly in ``state_col``
    country_col, state_col, date_col : column names used by this report layout

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (original row labels kept) with
        ``date_col`` converted to ``datetime.date`` objects. The frame is
        empty when the report has no matching rows, including when it has no
        'US' rows at all.
    """
    df = pd.read_csv(f, sep=',')
    # A boolean mask (instead of groupby(...).get_group('US')) avoids a
    # KeyError for reports that contain no 'US' rows.
    wanted = (df[country_col] == 'US') & (df[state_col] == state_name)
    state = df[wanted].copy()
    # Keep only the calendar date; the time of day is discarded.
    state[date_col] = pd.to_datetime(state[date_col]).dt.date
    return state


def get_daily_data(f,state_name):
    """Read a daily report that uses underscore-style column names.

    Expects the columns 'Country_Region', 'Province_State' and 'Last_Update'.
    Returns the US rows whose 'Province_State' equals ``state_name``, with
    'Last_Update' reduced to a ``datetime.date``.
    """
    return _read_us_state_rows(f, state_name,
                               country_col='Country_Region',
                               state_col='Province_State',
                               date_col='Last_Update')


def get_daily_data_2(f,state_name):
    """Read a daily report that uses slash/space-style column names.

    Expects the columns 'Country/Region', 'Province/State' and 'Last Update'.
    Returns the US rows whose 'Province/State' equals ``state_name``, with
    'Last Update' reduced to a ``datetime.date``.
    """
    return _read_us_state_rows(f, state_name,
                               country_col='Country/Region',
                               state_col='Province/State',
                               date_col='Last Update')

# Spot check: display the economy entry for one state.
us_state_economy.loc['Alabama']

gdp    47494
Name: Alabama, dtype: int64

In [23]:
import glob

# Glob pattern matching every daily report CSV.
file_prefix = '../../corona/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'

# All matching paths in lexicographic order.
files = sorted(glob.glob(file_prefix))

# The same paths as a Series, so each one carries its position as the label.
file_s = pd.Series(files)

# process_state reads reports at positions > change_idx with get_daily_data
# and the remaining ones with get_daily_data_2.
# NOTE(review): presumably the position of the last report in the older
# column layout -- verify against the files on disk.
change_idx = 59


def process_state(state_name):
    """Build the per-day timeline of case counts for one US state.

    Every path in the module-level ``file_s`` is read. Reports whose position
    is greater than ``change_idx`` are parsed with ``get_daily_data`` and
    summed per (state, date); the remaining ones are parsed with
    ``get_daily_data_2`` and used row by row. Rows keep the order of
    ``file_s``, older-layout reports first.

    Parameters
    ----------
    state_name : str
        State to extract. It must also be a label in the index of
        ``us_state_data`` and ``us_state_economy``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with the columns 'confirmed', 'deceased',
        'recovered', 'inc', 'dead_inc', 'factor', 'dead_factor', 'density',
        'population', 'growth', 'gdp', 'conf_per_M' and 'dead_per_M'.

    Raises
    ------
    ValueError
        If ``file_s`` is empty (there is nothing to concatenate).
    KeyError
        If ``state_name`` is missing from the state lookup tables.
    """
    key_cols = ['Province_State', 'Last_Update']
    count_cols = ['Confirmed', 'Deaths', 'Recovered']

    # Collect the per-file frames and concatenate once; calling pd.concat
    # inside the loop re-copies all accumulated rows on every iteration.
    new_frames = []
    old_frames = []
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for i, f in file_s.items():
        if i > change_idx:
            new_frames.append(get_daily_data(f, state_name))
        else:
            old_frames.append(get_daily_data_2(f, state_name))

    parts = []
    if old_frames:
        old = pd.concat(old_frames, axis=0).rename(
            columns={'Province/State': 'Province_State',
                     'Last Update': 'Last_Update'})
        # Select the needed columns by name so that extra columns in a
        # report cannot shift the layout.
        parts.append(old[key_cols + count_cols])
    if new_frames:
        new = pd.concat(new_frames, axis=0)
        # Sum only the count columns: several rows can share one
        # (state, date) key, and text or extra columns must not take part
        # in the aggregation.
        parts.append(new.groupby(key_cols)[count_cols].sum().reset_index())

    timeline_df = pd.concat(parts, axis=0, ignore_index=True)

    # Change and ratio relative to the previous row.
    confirmed = timeline_df['Confirmed']
    deaths = timeline_df['Deaths']
    timeline_df['inc'] = confirmed - confirmed.shift()
    timeline_df['dead_inc'] = deaths - deaths.shift()
    timeline_df['factor'] = confirmed / confirmed.shift()
    timeline_df['dead_factor'] = deaths / deaths.shift()
    # A positive count divided by a previous count of 0 gives inf; store NaN.
    timeline_df = timeline_df.replace(np.inf, np.nan)

    # Rename by label rather than assigning a positional list of names.
    timeline_df = (
        timeline_df
        .drop(columns='Province_State')
        .rename(columns={'Last_Update': 'date',
                         'Confirmed': 'confirmed',
                         'Deaths': 'deceased',
                         'Recovered': 'recovered'})
        .set_index('date')
    )

    # Per-state constants, repeated on every row.
    population = us_state_data.loc[state_name, 'Pop']
    timeline_df['density'] = us_state_data.loc[state_name, 'density']
    timeline_df['population'] = population
    timeline_df['growth'] = us_state_data.loc[state_name, 'Growth']
    timeline_df['gdp'] = us_state_economy.loc[state_name, 'gdp']

    # Counts per million inhabitants.
    timeline_df['conf_per_M'] = timeline_df['confirmed'] / (population / 1e6)
    timeline_df['dead_per_M'] = timeline_df['deceased'] / (population / 1e6)

    return timeline_df





In [24]:
# Remove leading/trailing whitespace so the names can serve as exact lookup keys.
us_states = us_states.assign(state_name=us_states['state_name'].str.strip())

In [25]:

# One timeline per state, keyed by state name. Iterating the column directly
# avoids Series.iteritems(), which pandas 2.0 removed.
us_state_dict = {name: process_state(name) for name in us_states['state_name']}

ny = us_state_dict['New York']


In [26]:
# Move 'date' from the index back into a column. The guard keeps the cell
# safe to re-run: reset_index() on an already-reset frame would add a
# spurious 'index' column.
if ny.index.name == 'date':
    ny.reset_index(inplace=True)

In [27]:
#patch erroneous date

# The patch addresses a row by position, so 'date' must be a regular column.
# If this cell is re-run after 'date' already became the index, restore the
# column first; otherwise .at[3, 'date'] would enlarge the frame with a new
# row and column instead of overwriting the existing entry.
if ny.index.name == 'date':
    ny = ny.reset_index()

# NOTE(review): position 3 is hard-coded -- confirm it is still the row that
# needs correcting whenever the input files change.
ny.at[3,'date'] = pd.to_datetime('2020-03-13').date()
ny = ny.set_index('date')
us_state_dict['New York'] = ny

us_state_dict['New York']

Unnamed: 0_level_0,confirmed,deceased,recovered,inc,dead_inc,factor,dead_factor,density,population,growth,gdp,conf_per_M,dead_per_M
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-03-10,173.0,0.0,0.0,,,,,1068.424744,19440469,-0.0052,89076,8.898962,0.0
2020-03-11,220.0,0.0,0.0,47.0,0.0,1.271676,,1068.424744,19440469,-0.0052,89076,11.316599,0.0
2020-03-12,328.0,0.0,0.0,108.0,0.0,1.490909,,1068.424744,19440469,-0.0052,89076,16.872021,0.0
2020-03-13,421.0,0.0,0.0,93.0,0.0,1.283537,,1068.424744,19440469,-0.0052,89076,21.655856,0.0
2020-03-14,525.0,2.0,0.0,104.0,2.0,1.247031,,1068.424744,19440469,-0.0052,89076,27.005521,0.102878
2020-03-15,732.0,3.0,0.0,207.0,1.0,1.394286,1.5,1068.424744,19440469,-0.0052,89076,37.653413,0.154317
2020-03-16,967.0,10.0,0.0,235.0,7.0,1.321038,3.333333,1068.424744,19440469,-0.0052,89076,49.741598,0.514391
2020-03-17,1706.0,13.0,0.0,739.0,3.0,1.764219,1.3,1068.424744,19440469,-0.0052,89076,87.755085,0.668708
2020-03-18,2495.0,16.0,0.0,789.0,3.0,1.462485,1.230769,1068.424744,19440469,-0.0052,89076,128.340525,0.823025
2020-03-19,5365.0,34.0,0.0,2870.0,18.0,2.150301,2.125,1068.424744,19440469,-0.0052,89076,275.970708,1.748929


In [28]:
import pickle

# Serialize the dict of per-state timelines to 'us_state_dict.pkl'.
# (The bare `us_state_dict['New York']` expression that used to open this
# cell was removed: it was not the cell's last statement, so it displayed
# nothing.)
with open('us_state_dict.pkl','wb') as f:
    pickle.dump(us_state_dict,f,pickle.HIGHEST_PROTOCOL)
    
