In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# List of state names: first column of a ';'-separated file without a header row.
us_states = pd.read_csv(
    'us_states.csv',
    sep=';',
    header=None,
    usecols=[0],
    names=['state_name'],
)

# Per-state population and density table.
# Source: https://worldpopulationreview.com/states/
us_state_data = pd.read_csv('us_states_pop_density.csv')

# 2.58998811 is the number of square kilometres in one square mile.
sqr_mile_factor = 2.58998811

# NOTE(review): the density column is *multiplied* by the factor. If the CSV
# stores people per square mile and the target is people per square kilometre,
# this should be a division instead -- confirm the unit of the source column.
us_state_data['density'] = us_state_data['density'] * sqr_mile_factor

# Index by state name so rows can be looked up with us_state_data.loc[name, ...].
us_state_data = us_state_data.set_index('State')

def _read_us_state_rows(f, state_name, country_col, state_col, date_col):
    """Load one daily report and return the US rows of a single state.

    Shared implementation behind ``get_daily_data`` and ``get_daily_data_2``,
    which differ only in the column names used by the report.

    Parameters
    ----------
    f : str or file-like
        Path (or open buffer) of a comma-separated daily report.
    state_name : str
        Value to match exactly in ``state_col``.
    country_col, state_col, date_col : str
        Names of the country, state and last-update columns in this report.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (original row index kept) with
        ``date_col`` converted to ``datetime.date`` objects. Empty when the
        report has no matching rows.
    """
    df = pd.read_csv(f, sep=',')
    # A boolean mask instead of groupby(...).get_group('US'): it avoids
    # grouping the whole frame just to pick one group, and a report without
    # any US rows yields an empty frame rather than raising KeyError.
    wanted = (df[country_col] == 'US') & (df[state_col] == state_name)
    state = df[wanted].copy()
    # Keep only the calendar date; the time of day of the update is dropped.
    state[date_col] = pd.to_datetime(state[date_col]).dt.date
    return state


def get_daily_data(f,state_name):
    """Return the US rows for ``state_name`` from a report with underscore-style
    column names (``Country_Region``, ``Province_State``, ``Last_Update``).

    ``Last_Update`` is converted to ``datetime.date``. See
    ``_read_us_state_rows`` for parameter and return details.
    """
    return _read_us_state_rows(f, state_name,
                               'Country_Region', 'Province_State', 'Last_Update')


def get_daily_data_2(f,state_name):
    """Return the US rows for ``state_name`` from a report with slash/space-style
    column names (``Country/Region``, ``Province/State``, ``Last Update``).

    ``Last Update`` is converted to ``datetime.date``. See
    ``_read_us_state_rows`` for parameter and return details.
    """
    return _read_us_state_rows(f, state_name,
                               'Country/Region', 'Province/State', 'Last Update')

# Last expression of the cell: display the loaded state table as the cell output.
us_state_data

In [None]:
import glob
import os
from datetime import datetime

# Glob pattern matching every daily report CSV in the local data checkout.
file_prefix = '../../corona/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'


def daily_report_sort_key(path):
    """Return a sort key that orders daily-report paths chronologically.

    File names of the form ``MM-DD-YYYY.csv`` are ordered by the date they
    encode. Names that do not parse as such a date sort after all dated
    files, ordered by path, instead of raising.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return (0, datetime.strptime(stem, '%m-%d-%Y'), path)
    except ValueError:
        return (1, datetime.max, path)


# Sort by the date in the file name, not by the raw string: a plain string
# sort of MM-DD-YYYY names puts e.g. 01-01-2021 before 03-22-2020, which would
# scramble the timeline and invalidate the positional change_idx below.
files = sorted(glob.glob(file_prefix), key=daily_report_sort_key)

# Series of paths; its integer index is the chronological position of each report.
file_s = pd.Series(files)

# Reports at positions <= change_idx are read with get_daily_data_2, later ones
# with get_daily_data (see process_state).
# NOTE(review): 59 presumably marks the last report before the column layout
# changed -- confirm against the files on disk.
change_idx = 59


def process_state(state_name):
    """Build the daily case/death timeline for a single US state.

    Every report listed in the module-level ``file_s`` is read. Reports at
    positions ``<= change_idx`` are loaded with ``get_daily_data_2`` and used
    row by row; later reports are loaded with ``get_daily_data`` and their
    ``Confirmed``/``Deaths``/``Recovered`` counts are summed per state and
    date. The two parts are stacked (older layout first) and day-over-day
    increments and growth factors are derived from consecutive rows.

    Parameters
    ----------
    state_name : str
        State to extract; must also be an index label of ``us_state_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with columns ``confirmed``, ``deceased``,
        ``recovered``, ``inc``, ``dead_inc``, ``factor``, ``dead_factor``,
        ``density``, ``population``, ``conf_per_M`` and ``dead_per_M``.

    Raises
    ------
    ValueError
        If ``file_s`` holds no report on one side of ``change_idx``
        (``pd.concat`` has nothing to concatenate).
    KeyError
        If an expected column is missing from the reports or ``state_name``
        is not in ``us_state_data``.
    """
    count_cols = ['Confirmed', 'Deaths', 'Recovered']

    # Collect the per-file frames in lists and concatenate once; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    new_format_frames = []
    old_format_frames = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for i, f in file_s.items():
        if i > change_idx:
            new_format_frames.append(get_daily_data(f, state_name))
        else:
            old_format_frames.append(get_daily_data_2(f, state_name))

    # Newer layout: several rows per state and date, so sum the counts.
    # The count columns are selected explicitly so that extra columns in the
    # reports (numeric or text) cannot leak into the result.
    timeline_df_1 = (
        pd.concat(new_format_frames, axis=0)
        .groupby(['Province_State', 'Last_Update'])[count_cols]
        .sum()
        .reset_index()
    )

    # Older layout: rows are used as they are; align the column names with
    # the newer layout and keep only the columns needed downstream.
    old_renamed = pd.concat(old_format_frames, axis=0).rename(
        columns={'Province/State': 'Province_State',
                 'Last Update': 'Last_Update'})
    timeline_df_2 = old_renamed[['Province_State', 'Last_Update'] + count_cols]

    timeline_df = pd.concat([timeline_df_2, timeline_df_1], axis=0)

    # Day-over-day change and growth factor relative to the previous row.
    # Rows are in file order; date anomalies in the source data are not
    # corrected here.
    prev_confirmed = timeline_df['Confirmed'].shift()
    prev_deaths = timeline_df['Deaths'].shift()
    timeline_df['inc'] = timeline_df['Confirmed'] - prev_confirmed
    timeline_df['inc_dead'] = timeline_df['Deaths'] - prev_deaths
    timeline_df['factor'] = timeline_df['Confirmed'] / prev_confirmed
    timeline_df['factor_dead'] = timeline_df['Deaths'] / prev_deaths
    # A previous count of zero produces inf; treat it as missing.
    timeline_df = timeline_df.replace(np.inf, np.nan)

    # Rename by name rather than by position so an unexpected column order
    # cannot silently mislabel the data.
    timeline_df = (
        timeline_df
        .rename(columns={'Province_State': 'state',
                         'Last_Update': 'date',
                         'Confirmed': 'confirmed',
                         'Deaths': 'deceased',
                         'Recovered': 'recovered',
                         'inc_dead': 'dead_inc',
                         'factor_dead': 'dead_factor'})
        .set_index('date')
        .drop(columns='state')
    )

    # Attach the static per-state figures and per-million rates.
    population = us_state_data.loc[state_name, 'Pop']
    timeline_df['density'] = us_state_data.loc[state_name, 'density']
    timeline_df['population'] = population
    timeline_df['conf_per_M'] = timeline_df['confirmed'] / (population / 1e6)
    timeline_df['dead_per_M'] = timeline_df['deceased'] / (population / 1e6)

    return timeline_df





In [None]:
# Remove leading/trailing whitespace from the state names: process_state
# compares them for exact equality with the reports' state column and uses
# them as index labels into us_state_data.
us_states['state_name'] = us_states['state_name'].str.strip()

In [None]:

# Timeline DataFrame for every state, keyed by state name.
us_state_dict = dict()

# Iterate over the names directly: Series.iteritems() was removed in
# pandas 2.0, and the index it yielded was never used.
for s in us_states['state_name']:
    us_state_dict[s] = process_state(s)

# Handle on the New York timeline for the manual fix-ups in the following cells.
ny = us_state_dict['New York']


In [None]:
# Turn the 'date' index back into a column so single rows can be addressed by
# position label. Derived from the dict entry without inplace mutation, so
# re-running this cell always starts from the stored frame instead of
# resetting an already-reset index a second time.
ny = us_state_dict['New York'].reset_index()

In [None]:
# Patch an erroneous date in the New York timeline.
# NOTE(review): row label 3 is presumably the entry carrying the wrong date --
# verify against the data whenever the set of input files changes.
ny.at[3, 'date'] = pd.Timestamp('2020-03-13').date()

# Restore 'date' as the index and store the patched frame back in the dict.
us_state_dict['New York'] = ny = ny.set_index('date')

# Show the patched timeline as the cell output.
us_state_dict['New York']

In [None]:
import pickle

# Persist all state timelines in one file. A bare `us_state_dict['New York']`
# expression used to sit at the top of this cell; it was removed because only
# the last expression of a cell is displayed, so it had no effect.
# Only load this pickle from a trusted location: unpickling can execute code.
with open('us_state_dict.pkl', 'wb') as f:
    pickle.dump(us_state_dict, f, pickle.HIGHEST_PROTOCOL)
    
