To Integrate
- Change directories where commented '#Change Dir'
- Download the new KFF State Data at 'https://www.kff.org/health-costs/issue-brief/state-data-and-policy-actions-to-address-coronavirus/' and drop raw_data.csv into the corresponding directory (see State Actions section below)
- Run the script around 5 pm EST and ensure the output printed under the Final Checks section corresponds to today's date. Example:
**** COVID TRACKING DATE: 2020-04-21 00:00:00  ****

# Functions + Imports



In [None]:
import pandas as pd
import os
from os import path
import glob
import cProfile
import numpy as np
import requests, zipfile, io
#Change Dir
BOX_PATH = '/Users/tyler.poelkingibm.com/Box Sync/Mondelez: Demand forecasts during COVID-19/4. EDA & Descriptive analytics'


In [None]:
def grabstate(string):
    """Return the rows of the global ``states_daily`` frame for one state.

    ``string`` is compared for equality against the ``state`` column.
    """
    is_requested_state = states_daily['state'] == string
    return states_daily[is_requested_state]

def display_all(df):
    """Display ``df`` with the row and column display limits raised to 1000."""
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)

# The COVID Tracking Project
CovidActNow was created by a team of data scientists, engineers, and designers in partnership with epidemiologists, 
public health officials, and political leaders to help understand how the COVID-19 pandemic will affect 
their region

Source: https://covidtracking.com/

In [None]:
#Current snapshots and reference info from the COVID Tracking Project API
states_current = pd.read_csv('https://covidtracking.com/api/states.csv')
states_info = pd.read_csv('https://covidtracking.com/api/states/info.csv')
US_current = pd.read_csv('https://covidtracking.com/api/us.csv')

#Daily histories; their 'date' column is parsed into timestamps on load
states_daily = pd.read_csv('https://covidtracking.com/api/states/daily.csv', parse_dates=['date'])
US_daily = pd.read_csv('https://covidtracking.com/api/us/daily.csv', parse_dates=['date'])

#Hospital beds per 1,000 people. American Hospital Association Annual Survey (2018).
#The national 'United States' row is dropped so only per-location rows remain.
beds = (
    pd.read_csv(f'{BOX_PATH}/Data/Static/State/hospital_beds.csv', header=2)
    .loc[lambda frame: frame['Location'] != 'United States']
)
beds.columns = ['Location', 'Hospital_Beds_Per_1k']

In [None]:
#Remove unneeded territories from both frames; states_current also loses its 'notes' and 'hash' columns
excluded_territories = ['AS', 'MP', 'GU', 'PR', 'VI']
states_daily = states_daily.loc[~states_daily['state'].isin(excluded_territories)]
states_current = (
    states_current
    .loc[~states_current['state'].isin(excluded_territories)]
    .drop(columns=['notes', 'hash'])
)


In [None]:
#Sort. Required: the per-state percent-change, rolling and cumulative-count features below
#are computed in row order, so rows must be chronological within each state.
states_daily = states_daily.sort_values(['state', 'date'], ascending=True)
US_daily = US_daily.sort_values('date', ascending=True)

## Feature Generation

In [None]:
#Columns whose '<name>Increase' counterpart is renamed to 'daily <name>' further down
incr_cols = ['positive', 'negative','hospitalized','death']
#Columns that receive the log, percent-change and rolling percent-change features
stat_cols = ['positive', 'negative','pending','hospitalized','death']

### Logs (For Outliers)

In [None]:
#Natural log of (value + 1) for every stat column; the +1 keeps zero counts finite
log_features = {f'{name}_log': np.log(states_daily[name] + 1) for name in stat_cols}
for feature_name, feature_values in log_features.items():
    states_daily[feature_name] = feature_values

### Percent Changes

In [None]:
#One shared mapping: '<name>Increase' -> 'daily <name>', plus the total test results column
increase_renames = {f'{name}Increase': f'daily {name}' for name in incr_cols}
increase_renames['totalTestResultsIncrease'] = 'daily total'

states_daily = states_daily.rename(columns=increase_renames)
US_daily = US_daily.rename(columns=increase_renames)

In [None]:
#Day-over-day percent change of every stat column, per state and for the US as a whole.
#GroupBy.pct_change returns a Series on the original row index, so the assignment always aligns;
#groupby(...).apply(pd.Series.pct_change) gains an extra group-key index level under newer pandas defaults.
for col in stat_cols:
    states_daily[f'{col}_pct_chg'] = states_daily.groupby('state')[col].pct_change()
    US_daily[f'{col}_pct_chg'] = US_daily[col].pct_change()
    

### Rolling Percent Changes

In [None]:
#Rolling mean of the daily percent changes over 3- and 5-row windows
windows = [3, 5]
for window in windows:
    for col in stat_cols:
        source_col = f'{col}_pct_chg'
        target_col = f'{col}_{window}_day_avg_pct_chg'

        #Per-state rolling mean; dropping the 'state' index level restores the original row index
        per_state_mean = states_daily.groupby('state')[source_col].rolling(window).mean()
        states_daily[target_col] = per_state_mean.reset_index(level=0, drop=True)

        US_daily[target_col] = US_daily[source_col].rolling(window).mean()
        

### Days since first instance

In [None]:
#Keep only rows from the point where a state's running total of 'positive' exceeds zero,
#number those rows 0, 1, 2, ... per state, then attach the counter back by row index
has_seen_positive = states_daily.groupby('state')['positive'].cumsum().gt(0)
tmp = states_daily[has_seen_positive].copy()
tmp['days_since_first_pos'] = tmp.groupby('state').cumcount()
states_daily = states_daily.merge(
    tmp['days_since_first_pos'],
    how='left',
    left_index=True,
    right_index=True,
)


### Clean + Write

In [None]:
#Replace +/- infinity (e.g. percent change from a zero base) with NaN in both frames
infinities = [np.inf, -np.inf]
states_daily = states_daily.replace(infinities, np.nan)
US_daily = US_daily.replace(infinities, np.nan)

In [None]:
#Column subset written to states_daily.csv (the "simple" per-state export)
states_cols_simple = ['date', 'state', 'positive', 'negative', 'pending', 'hospitalized',
       'death', 'dateChecked', 'daily positive',
       'daily negative','daily hospitalized',
       'daily death', 'positive_pct_chg',
       'negative_pct_chg', 'pending_pct_chg', 'hospitalized_pct_chg',
       'death_pct_chg']

#Column subset written to US_daily.csv (the "simple" national export)
US_cols_simple = ['date', 'states', 'positive', 'negative', 'pending', 'hospitalized',
       'death', 'daily positive',
       'daily negative', 'daily hospitalized',
       'daily death', 'positive_pct_chg',
       'negative_pct_chg', 'pending_pct_chg', 'hospitalized_pct_chg',
       'death_pct_chg']

In [None]:
#Write every COVID Tracking Project frame to the shared folder; the "_all" files keep every column,
#the plain daily files only the simple column subsets
covid_dir = f'{BOX_PATH}/Data/Corona/COVID Tracking Project'
covid_outputs = [
    (states_current, 'states_current.csv'),
    (states_daily[states_cols_simple], 'states_daily.csv'),
    (states_daily, 'states_daily_all.csv'),
    (states_info, 'states_info.csv'),
    (US_current, 'US_current.csv'),
    (US_daily[US_cols_simple], 'US_daily.csv'),
    (US_daily, 'US_daily_all.csv'),
]
for frame, file_name in covid_outputs:
    frame.to_csv(f'{covid_dir}/{file_name}', index=False)

# Static Data


## State level
Data USA puts public US Government data in your hands. Instead of searching through multiple data sources that are often incomplete and difficult to access, you can simply point to Data USA to answer your questions

Source: https://datausa.io/ 

In [None]:
box_dir = f'{BOX_PATH}/Data/State'

In [None]:
#Build one wide frame of static state attributes from every CSV in box_dir.
#Each file is reduced to a single row per Geography (its latest Year, with gaps forward-filled
#from earlier years) and the per-file frames are then joined side by side on Geography.
mini_state_static_list = []
id_cols = ['Year', 'Geography', 'ID Geography']

#Sorted so the "keep the first of duplicated columns" step below does not depend on
#the arbitrary order in which the file system lists the directory
for filename in sorted(os.listdir(box_dir)):
    if not filename.endswith(".csv"):
        continue

    print(f'Aggregating {filename}')
    min_state_static = pd.read_csv(f'{box_dir}/{filename}')
    min_state_static['Geography'] = min_state_static['Geography'].str.strip()
    min_state_static.sort_values('Year', inplace=True, ascending=True)

    #Carry the last known value forward within each Geography so the latest row is as complete as possible
    value_cols = [name for name in min_state_static.columns if name not in id_cols]
    for value_col in value_cols:
        min_state_static[value_col] = min_state_static.groupby('Geography')[value_col].ffill()

    #Rows are sorted by Year, so keep='last' retains the most recent year per Geography
    min_state_static.drop_duplicates(subset=['Geography'], keep='last', inplace=True)
    assert not min_state_static['Geography'].duplicated().any()
    min_state_static.set_index('Geography', inplace=True)
    mini_state_static_list.append(min_state_static)


full_state_static = pd.concat(mini_state_static_list, axis=1,copy=False).reset_index().rename(columns={'index':'Geography'})
full_state_static = full_state_static[~full_state_static['Geography'].isin(['Puerto Rico','American Samoa', 'Federated States of Micronesia','Marshall Islands', 'Commonwealth of the Northern Mariana Islands','Palau','United States Virgin Islands','Guam'])]
#Columns shared by several files (e.g. Year) appear more than once after the concat; keep the first occurrence
full_state_static = full_state_static.loc[:,~full_state_static.columns.duplicated()]

## Add state initial feature

In [None]:
# Maps full state / territory names to their two-letter abbreviations.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# replace() swaps names found in the mapping for their abbreviation and leaves any other value unchanged
full_state_static['State Initial'] = full_state_static['Geography'].replace(us_state_abbrev)

# Combine COVID data with State Data

In [None]:
#Attach the static state attributes (matched on the two-letter code) and then hospital beds
#(matched on the full state name); validate='m:1' requires the right-hand keys to be unique
states_daily = (
    states_daily
    .merge(full_state_static, how='left', left_on='state', right_on='State Initial', validate='m:1')
    .merge(beds, how='left', left_on='Geography', right_on='Location', validate='m:1')
)

In [None]:
assert (states_daily['Geography'].nunique() ==51)

In [None]:
#Change Dir
states_daily.to_csv(f'{BOX_PATH}/Data/Combined/states_covid_daily_w_demographics.csv', index=False)

# State Actions

Source (State Actions to Mitigate the Spread of COVID-19): https://www.kff.org/health-costs/issue-brief/state-data-and-policy-actions-to-address-coronavirus/

In [None]:
#TODO: Add your directory where you wish to store the latest version of KFF State Actions

#This will error if the file was not uploaded/modified today. If so, grab the file from KFF and upload it to the dir
#NOTE: the next line uses IPython shell magic (`!`), so this cell only runs inside a notebook/IPython session.
#NOTE(review): GetFileInfo is presumably the macOS command-line tool -- confirm it is available on the machine
date_modified = ! cd Data/State/Actions/new/ && GetFileInfo -d raw_data.csv
print(f'**** Latest KFF State Action File modified on {date_modified} ****')
assert(pd.Timestamp(date_modified[0]).floor("D") == pd.Timestamp.today().floor(freq='D'))

In [None]:
#Change Dir
box_dir = f'{BOX_PATH}/Data/State'
#TODO: Add your directory where you wish to store the latest version of KFF State Actions
#Change Dir
local_dir = 'Data/State/Actions/new'

#Midnight of the run day; used both to drop and to stamp today's rows
run_date = pd.Timestamp.today().floor(freq='D')

#Load historical actions
ta_h = pd.read_csv(f'{box_dir}/Actions/historical/state_actions_historical.csv', parse_dates=['date'])
#Remove today if there (for repopulating).
#Both sides are floored to midnight: comparing datetime.date values against an un-floored
#Timestamp (which carries the current time of day) never matched, so a same-day re-run kept
#today's rows and then appended them a second time.
ta_h = ta_h[ta_h['date'].dt.floor('D') != run_date]

#Load new actions
ta = pd.read_csv(f'{local_dir}/raw_data.csv',header=2)


#Process Data
ta = ta.drop(['Footnotes','Primary Election Postponement'], axis=1)
#Drops every row that has a missing value in any remaining column
ta = ta.dropna()
ta = ta.replace('-', 'None')
#.copy() so the column assignment below acts on an independent frame rather than a slice
ta = ta[ta['Location']!='United States'].copy()
ta['date'] = run_date
ta = ta.rename(columns={'Location':'Geography','School Closures': 'State-Mandated School Closures'})


#Sort columns and assert they are the same (checks for site changes to column names)
ta = ta.reindex(sorted(ta.columns), axis=1)
ta_h = ta_h.reindex(sorted(ta_h.columns), axis=1)
assert len(ta)==51, f'Expected 51 locations in the new KFF file, found {len(ta)}'
#zip() stops at the shorter column list, so only the leading, pairwise-aligned columns are compared
column_pairs = list(zip(ta.columns,ta_h.columns))
assert all(new_col == hist_col for new_col, hist_col in column_pairs), column_pairs


In [None]:
[i[0]==i[1] for i in list(zip(ta.columns,ta_h.columns))]

In [None]:
ta_h.columns

## Append new day's data to historical

In [None]:
#Action columns we will use to create calculated columns
#Add new as needed here
#Each name listed here gets a 'days_on_<name>_status' counter column in a later cell
action_cols = ['State Is Easing Social Distancing Measures','Bar/Restaurant Limits',
 'Emergency Declaration',
 'Large Gatherings Ban',
 'Stay at Home Order',
'Mandatory Quarantine for Travelers',
 'Non-Essential Business Closures',
 'State-Mandated School Closures']

In [None]:
#Stack history and today's snapshot, strip literal '*' characters from the action values,
#then order chronologically within each Geography
ta_total = pd.concat([ta_h, ta], axis=0)

for action in action_cols:
    ta_total[action] = ta_total[action].str.replace("*", "", regex=False)

ta_total = ta_total.sort_values(['Geography', 'date'])



## Calculate number of days each state has implemented each action

In [None]:
#For each action, count (starting at 1) how many rows a Geography has had the current value of
#that action so far; rows where the action value is missing get NaN instead of a count
for action in action_cols:
    rows_on_value = ta_total.groupby(['Geography', action]).cumcount() + 1
    ta_total[f'days_on_{action}_status'] = rows_on_value.where(ta_total[action].notna())

## New Action Indicator Column Given Threshold Number of Days Implemented

In [None]:
today = (pd.Timestamp.today().floor(freq='D'))
thresh = 1

In [None]:
#Add new as needed here
status_day_cols = [
    'days_on_State Is Easing Social Distancing Measures_status',
    'days_on_Bar/Restaurant Limits_status',
    'days_on_Emergency Declaration_status',
    'days_on_Large Gatherings Ban_status',
    'days_on_Stay at Home Order_status',
    'days_on_Mandatory Quarantine for Travelers_status',
    'days_on_Non-Essential Business Closures_status',
    'days_on_State-Mandated School Closures_status',
]

#Flag today's rows where at least one action counter equals the threshold
hit_threshold = ta_total[status_day_cols].eq(thresh).any(axis=1)
ta_total.loc[(ta_total['date'] == today) & hit_threshold, 'new_state_action'] = 1

#Only rows that have no value yet are set to 0; values already present are kept
ta_total['new_state_action'] = ta_total['new_state_action'].fillna(0)

## Individual Action Indicators

In [None]:
#Add new as needed here
#indicator column -> the day-counter column that drives it
individual_indicators = {
    'new_state_action_easing_social_dist_measures': 'days_on_State Is Easing Social Distancing Measures_status',
    'new_state_action_NEB_closures': 'days_on_Non-Essential Business Closures_status',
    'new_state_action_restaurant_limits': 'days_on_Bar/Restaurant Limits_status',
    'new_state_action_emergency': 'days_on_Emergency Declaration_status',
    'new_state_action_stay_at_home': 'days_on_Stay at Home Order_status',
}

#Flag today's rows whose counter is exactly 1, i.e. the first row on the current value
is_today = ta_total['date'] == today
for indicator, counter_col in individual_indicators.items():
    ta_total.loc[is_today & (ta_total[counter_col] == 1), indicator] = 1

#Rows without a value become 0; existing values are left alone
for indicator in [*individual_indicators, 'new_state_action']:
    ta_total[indicator] = ta_total[indicator].fillna(0)

In [None]:
# Cumulative unique actions: running per-Geography total of each indicator
#Add new as needed here
changecount_sources = {
    'easing_social_dist_measures_changecount': 'new_state_action_easing_social_dist_measures',
    'neb_closures_changecount': 'new_state_action_NEB_closures',
    'restaurant_limits_changecount': 'new_state_action_restaurant_limits',
    'state_emergency_changecount': 'new_state_action_emergency',
    'stay_at_home_changecount': 'new_state_action_stay_at_home',
}
for count_col, indicator_col in changecount_sources.items():
    ta_total[count_col] = ta_total.groupby('Geography')[indicator_col].cumsum()



In [None]:
#  ['State Is Easing Social Distancing Measures','Bar/Restaurant Limits',
#  'Emergency Declaration',
#  'Large Gatherings Ban',
#  'Stay at Home Order',
# 'Mandatory Quarantine for Travelers',
#  'Non-Essential Business Closures',
#  'State-Mandated School Closures']

## Merge with All Data

In [None]:
#Left-join the state action history onto the daily COVID frame; validate='1:1' makes pandas raise
#if a (date, Geography) pair is duplicated on either side
states_daily = states_daily.merge(ta_total, on=['date','Geography'], how='left', validate='1:1')

## Recent State Actions (Reformatted Table)

State	Date	Action 	Day Difference
(Most Recent Action)

In [None]:
test = ta_total.copy()
#Can Remove
easing_flagged = test['State Is Easing Social Distancing Measures'] == 1
no_stay_at_home = test['Stay at Home Order'] == "None"
test = test[~(easing_flagged & no_stay_at_home)]

#Reshape the 'days_on_<action>_status' counters from one column per action to one row per
#(Geography, date, action); the part of the column name after 'days_on' becomes the label
days_cols = [name for name in ta_total.columns if 'days_' in name]
first_action_day_tracker_sub = test[['Geography', 'date'] + days_cols]
first_action_day_tracker = pd.wide_to_long(
    first_action_day_tracker_sub,
    stubnames='days_on',
    i=["Geography", 'date'],
    j='Recent Status Changed',
    suffix='\\D+',
).reset_index()
first_action_day_tracker['Recent Status Changed'] = (
    first_action_day_tracker['Recent Status Changed'].str.replace('_', ' ')
)

#Bring the action values themselves back alongside the counters
tracker_value_cols = [
    'Geography',
    'date',
    'Bar/Restaurant Limits',
    'Mandatory Quarantine for Travelers',
    'Emergency Declaration',
    'Large Gatherings Ban',
    'Stay at Home Order',
    'Non-Essential Business Closures',
    'State-Mandated School Closures',
]
first_action_day_tracker_fin = first_action_day_tracker.merge(
    ta_total[tracker_value_cols], on=['Geography', 'date'], how='left'
)


In [None]:
#Add the two-letter state code, then export only the rows where a counter equals 1
first_action_day_tracker_fin['State'] = first_action_day_tracker_fin['Geography'].replace(us_state_abbrev)
first_day_rows = first_action_day_tracker_fin[first_action_day_tracker_fin['days_on'] == 1]
first_day_rows.to_csv(f'{BOX_PATH}/Data/State/Actions/first_day_action_tracker.csv', index=False)


In [None]:
#Last row per Geography in ta_total's current row order, reduced to Geography plus the day counters
curr_date = ta_total.drop_duplicates(subset=['Geography'], keep='last')
days_cols = [name for name in ta_total.columns if 'days_' in name]
curr_date_sub = curr_date[['Geography', *days_cols]]

In [None]:
#One row per (Geography, action counter); the column-name remainder becomes 'Recent Status Changed'
l = pd.wide_to_long(
    curr_date_sub,
    stubnames='days_on',
    i="Geography",
    j='Recent Status Changed',
    suffix='\\D+',
).reset_index()
l['Recent Status Changed'] = l['Recent Status Changed'].str.replace('_', ' ')

#Attach the current value of every action so the matching one can be picked per row later
current_value_cols = [
    'Geography',
    'State Is Easing Social Distancing Measures',
    'Bar/Restaurant Limits',
    'Mandatory Quarantine for Travelers',
    'Emergency Declaration',
    'Large Gatherings Ban',
    'Stay at Home Order',
    'Non-Essential Business Closures',
    'State-Mandated School Closures',
]
l = l.merge(curr_date[current_value_cols], on='Geography', how='left')


In [None]:
def label_action(row):
    """Return the value of the action column named by the row's 'Recent Status Changed' label.

    The label has every ' status' occurrence removed and surrounding whitespace stripped;
    the result is used as the key to look up in the same row.
    """
    action_name = row['Recent Status Changed'].replace(' status', '').strip()
    return row[action_name]

#Pick, per row, the value of the action that the row's label refers to
l['Action'] = l.apply(label_action, axis=1)

#Keep only the reporting columns and give the counter its display name
l = l[['Geography', 'Recent Status Changed', 'days_on', 'Action']].rename(
    columns={'days_on': 'Days Since Change'}
)
l['State'] = l['Geography'].replace(us_state_abbrev)

In [None]:
def label_date(row):
    """Return today's date (floored to midnight) minus ('Days Since Change' - 1) days.

    A 'Days Since Change' of 1 therefore maps to today.
    """
    elapsed = pd.Timedelta(days=(row['Days Since Change'] - 1))
    return pd.Timestamp.today().floor(freq='D') - elapsed

l['Date Implemented On']=l.apply(lambda row: label_date(row), axis=1)

In [None]:
l.sort_values('Days Since Change',inplace=True)

In [None]:
#Filter to only rows where Days Since Change = minimum(Days Since Change) for each state
#(accounts for if there are two changes in a day)

min_days_per_state = l.groupby('Geography')['Days Since Change'].transform('min')
l = l[l['Days Since Change'] == min_days_per_state]

In [None]:
#Filter on recent days only
l = l[l['Days Since Change']<=31]

In [None]:
l.sort_values('Days Since Change',inplace=True)#].value_counts()


# Projections

Source: http://www.healthdata.org/covid/data-downloads

## Download file from https://covid19.healthdata.org/projections as zip. Extract to Local


In [None]:
#Download the latest IHME projections archive and unzip it into Data/Projections
r = requests.get('https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip')
#Stop here with a clear HTTP error rather than a confusing "bad zip file" error further down
r.raise_for_status()
#The context manager closes the in-memory archive once extraction is done
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall(path='Data/Projections')


## Read Latest File

In [None]:
#Get the most recently created folder in Projections (max ctime) and read the single CSV inside it
projections_dir = 'Data/Projections/*'
#Only directories are candidates: a loose file that happens to be newest would otherwise be
#picked and the CSV glob below would find nothing
list_of_files = [entry for entry in glob.glob(projections_dir) if os.path.isdir(entry)]
assert list_of_files, 'NO PROJECTIONS FOLDER FOUND IN Data/Projections'
new_proj_file = max(list_of_files, key=os.path.getctime) + '/*.csv'
proj_path = glob.glob(new_proj_file)
assert(len(proj_path)==1), f'EXPECTED EXACTLY ONE PROJECTIONS CSV, FOUND {len(proj_path)}'
PROJ_STATUS = f'READ IHME PROJECTIONS FILE AT: {proj_path[0]}'

proj_path = proj_path[0]
projections = pd.read_csv(proj_path)

# Initial Processing of Latest Projection File

In [None]:
#Processing
#Exactly one column with 'date' in its name is expected; it is parsed and renamed to 'date'
date_cols = [name for name in projections.columns if 'date' in name]
assert(len(date_cols)==1)
date_col = date_cols[0]
projections[date_col] = pd.to_datetime(projections[date_col], infer_datetime_format=True)

projections = projections.rename(columns={'location_name': 'Geography', date_col: 'date'})

#filter to just geographies in states_daily, plus the national aggregate
geos = set(states_daily['Geography'])
geos.add('United States of America')
in_scope = projections['Geography'].isin(geos)
projections = projections[in_scope]
assert(52 == len(set(projections['Geography'])))

In [None]:
#Join state static data
#og_len is the row count before the merge; a later cell asserts it is unchanged
og_len = len(projections)
projections = projections.merge(states_daily, on=['date','Geography'], how='left', validate='1:1')
#The national aggregate has no row in states_daily, so its population is hard-coded here
projections.loc[projections['Geography']=='United States of America', 'Total Population']= 331002651

#Fill static date columns that did not have a corresponding match from COVID tracking data (unmatched dates)
#Forward-fill then back-fill within each Geography so every row of a Geography ends up with a value
for col in full_state_static.columns:
    projections[col] = projections.groupby('Geography')[col].fillna(method='ffill')
    projections[col] = projections.groupby('Geography')[col].fillna(method='bfill')

In [None]:
#Every projection row must have a population. `not` is required here: `~` applied to a Python
#bool yields -1 or -2, both truthy, so the earlier `~...hasnans` form could never fail.
assert not projections['Total Population'].hasnans, 'Total Population has missing values'


In [None]:
#New calculated columns

#Assumed death rate; the *100 below implies it is expressed in percent -- TODO confirm
death_rate = 1
projections['new_pop_affected'] = projections['deaths_mean'] *100 / death_rate
projections['total_pop_affected'] = projections['totdea_mean'] *100 / death_rate

#Shift 14 days for lag time between getting COVID and dying
#(shift(-14) within each Geography: every row takes the value from 14 rows later)
projections['total_pop_affected'] = projections.groupby('Geography')['total_pop_affected'].shift(-14)
projections['new_pop_affected'] = projections.groupby('Geography')['new_pop_affected'].shift(-14)

#Percent population affected
projections['perc_population_affected']=projections['total_pop_affected']/projections['Total Population']

#Threshold for lockdown openings based on 0.5% new cases growth rate WHO (https://www.aljazeera.com/news/2020/04/italy-remain-lockdown-3-200410232013521.html)
#GroupBy.pct_change keeps the original row index, so the assignment aligns on every pandas version
projections['affected_pct_chg'] = 100 * projections.groupby('Geography')['total_pop_affected'].pct_change()
#Growth from a zero base is infinite; treat it as 0. Reassigning (rather than an in-place
#replace on a column selection) guarantees the frame itself is updated.
projections['affected_pct_chg'] = projections['affected_pct_chg'].replace([np.inf,-np.inf], 0)


In [None]:
#Re-check after the calculated columns. `not` is required: `~` on a Python bool is always
#truthy (-1 or -2), which made the earlier `~...hasnans` assertion impossible to fail.
assert not projections['Total Population'].hasnans, 'Total Population has missing values'

## Validate and Remap Geography

In [None]:
assert (og_len == len(projections))
projections['State'] = projections['Geography'].replace(us_state_abbrev)


# Bucketing States By Projections Data

In [None]:
projections = projections[projections['Geography']!='Life Care Center, Kirkland, WA']

#.3 = 30%
thresh = .3

#One summary row per geography; collected in a list and concatenated once after the loop
#(DataFrame.append inside the loop re-copied the whole frame on every iteration)
peak_rows = []

for geo in geos:

    # filter to one geo
    state_df = projections[projections['Geography']==geo]

    #get the max deaths/day stat
    max_deaths = state_df['deaths_mean'].max()

    #calculate threshold deaths_mean: the `thresh` quantile of the strictly positive daily deaths
    thresh_percentile_death_rate = state_df.loc[state_df['deaths_mean']>0, 'deaths_mean'].quantile(thresh)

    #get date of peak. if multiple rows tie for the peak, keep the last one in row order
    #.copy() so the columns added below are written to an independent one-row frame
    max_day_geo = state_df[state_df['deaths_mean']==max_deaths].drop_duplicates(subset=['Geography'], keep='last').copy()
    if max_day_geo.empty:
        #No projection rows (or no deaths_mean values) for this geography: nothing to summarise
        continue
    max_day = max_day_geo['date'].values[0]

    above_thresh = state_df['deaths_mean']>=thresh_percentile_death_rate

    #Start date
    #Earliest date BEFORE the peak (left side of the 'bell curve') with deaths/day at or above the threshold.
    #Series.min()/max() give NaT when nothing qualifies; such rows are dropped by the later notna() filter.
    start_day = state_df.loc[above_thresh&(state_df['date']<max_day), 'date'].min()

    #End Date
    #Latest date AFTER the peak (right side of the 'bell curve') with deaths/day at or above the threshold
    end_day = state_df.loc[above_thresh&(state_df['date']>max_day), 'date'].max()

    #Lockdown removal threshold: first date after the peak where affected_pct_chg drops below 0.1
    lockdown_remove_day = state_df.loc[(state_df['affected_pct_chg']<0.1)&(state_df['date']>max_day), 'date'].min()

    #get cumulative deaths at the end date (NaN when there is no end date)
    end_totals = state_df.loc[state_df['date']==end_day, 'totdea_mean']
    end_death_cum = end_totals.values[0] if len(end_totals) else np.nan

    #Add columns
    max_day_geo['start_day'] = pd.Timestamp(start_day)
    max_day_geo['end_day'] = pd.Timestamp(end_day)
    max_day_geo['lockdown_removal_day'] = lockdown_remove_day
    max_day_geo['end_totdea_mean'] = end_death_cum
    max_day_geo['thresh_perc_dea'] = thresh_percentile_death_rate

    #Collect
    peak_rows.append(max_day_geo)

peak_deaths = pd.concat(peak_rows, ignore_index=True) if peak_rows else pd.DataFrame()

In [None]:
#Give the peak-day values explicit names and keep only the summary columns
peak_renames = {
    'date': 'peak_deaths_date',
    'deaths_mean': 'peak_deaths_mean',
    'totdea_mean': 'peak_totdea_mean',
}
peak_summary_cols = [
    'Geography',
    'start_day',
    'peak_deaths_date',
    'peak_deaths_mean',
    'peak_totdea_mean',
    'end_day',
    'end_totdea_mean',
    'thresh_perc_dea',
    'lockdown_removal_day',
    'Total Population',
]
peak_deaths = peak_deaths.rename(columns=peak_renames)[peak_summary_cols]

#Merge other data to peak deaths: the two-letter code, exposed as 'state'
peak_deaths = peak_deaths.merge(full_state_static[['Geography', 'State Initial']], on=['Geography'], how='left')
peak_deaths = peak_deaths.rename(columns={'State Initial': 'state'})

In [None]:
#Calculated Fields
#Midnight of the run day: subtracting an un-floored "now" made the whole-day counts below
#depend on the time of day the notebook was run (off by one against date-only columns)
as_of_day = pd.Timestamp.today().floor(freq='D')

#Peak of infections is taken as 14 days before the peak of deaths (same lag as the projections shift)
peak_deaths['peak_affected_date'] = peak_deaths['peak_deaths_date']-pd.Timedelta(days=14)
#Both percentages are on a 0-100 scale, rounded to 2 decimals
peak_deaths['perc_deaths_at_peak'] = np.round((100*(peak_deaths['peak_totdea_mean']/peak_deaths['Total Population'])),2)
peak_deaths['perc_deaths_at_end']=np.round((100*(peak_deaths['end_totdea_mean']/peak_deaths['Total Population'])),2)
peak_deaths['days_start_to_end']=(peak_deaths['end_day']-peak_deaths['start_day']).dt.days
#Add deaths per million (end_totdea_mean/(total population/1,000,000))
peak_deaths['total_deaths_at_end_per_million'] = peak_deaths['end_totdea_mean']/(peak_deaths['Total Population']/1000000)
peak_deaths['days_until_peak']=(peak_deaths['peak_deaths_date']-as_of_day).dt.days
peak_deaths['days_until_end']=(peak_deaths['end_day']-as_of_day).dt.days

In [None]:
peak_deaths.to_csv(f'{BOX_PATH}/Data/Projections/peak_deaths.csv',index=False)

In [None]:
peak_deaths['days_start_to_end'].hist()

In [None]:
#TODO: Solidify
#Segment rules, applied in order (later assignments overwrite earlier ones):
#  1: perc_deaths_at_end > .03 and start-to-end span <= 60 days
#  2: perc_deaths_at_end > .02, not already segment 1, span <= 84 days
#  4: peak deaths/day <= 10 (overrides 1 and 2)
#  3: everything that is still unassigned
high_share = peak_deaths['perc_deaths_at_end'] > .03
short_span = peak_deaths['days_start_to_end'] <= 60
peak_deaths.loc[high_share & short_span, 'state_segment'] = 1

medium_share = peak_deaths['perc_deaths_at_end'] > .02
not_segment_one = peak_deaths['state_segment'] != 1
medium_span = peak_deaths['days_start_to_end'] <= 84
peak_deaths.loc[medium_share & not_segment_one & medium_span, 'state_segment'] = 2

peak_deaths.loc[peak_deaths['peak_deaths_mean'] <= 10, 'state_segment'] = 4
peak_deaths['state_segment'] = peak_deaths['state_segment'].fillna(3)

#Drop geographies for which the span or the end-of-curve share could not be computed
has_span = peak_deaths['days_start_to_end'].notna()
has_share = peak_deaths['perc_deaths_at_end'].notna()
peak_deaths = peak_deaths[has_span & has_share]

In [None]:
#Add segments to states_daily; the row count must not change
og_len = len(states_daily)
segment_cols = [
    'Geography',
    'state_segment',
    'days_start_to_end',
    'end_totdea_mean',
    'total_deaths_at_end_per_million',
]
states_daily = states_daily.merge(peak_deaths[segment_cols], on='Geography', how='left', validate='m:1')
assert(og_len==len(states_daily))

In [None]:
#Add segments to projections; the row count must not change
og_len = len(projections)
projection_segment_cols = [
    'Geography',
    'state_segment',
    'start_day',
    'end_day',
    'days_start_to_end',
    'end_totdea_mean',
    'total_deaths_at_end_per_million',
    'thresh_perc_dea',
    'peak_deaths_mean',
    'peak_deaths_date',
    'peak_affected_date',
]
projections = projections.merge(
    peak_deaths[projection_segment_cols], on='Geography', how='left', validate='m:1'
)
assert(og_len==len(projections))

#Spread the peak date across every row of a Geography, then count days from each row's date to it
peak_date_by_geo = projections.groupby('Geography')['peak_deaths_date']
projections['peak_deaths_date'] = peak_date_by_geo.fillna(method='ffill')
peak_date_by_geo = projections.groupby('Geography')['peak_deaths_date']
projections['peak_deaths_date'] = peak_date_by_geo.fillna(method='bfill')
projections['days_until_peak'] = (projections['peak_deaths_date'] - projections['date']).dt.days


#KEEP Results Checks
#print(peak_deaths['state_segment'].value_counts())
#print(peak_deaths[peak_deaths['Geography']!='Maine'].groupby('state_segment')['perc_deaths_at_end'].mean())
#peak_deaths[['state_segment','Geography','days_start_to_end','perc_deaths_at_end','totdea_mean']].sort_values(['state_segment','totdea_mean'], ascending = [True,False])

# Final Checks


In [None]:
states_daily['date'].max().floor(freq='D')


In [None]:
#All 51 geographies must still be present after the merges
geography_count = states_daily['Geography'].nunique()
assert geography_count == 51

#Check COVID Tracking updated with today's date (will only work past 4pm based on site)
print(PROJ_STATUS)
latest_tracking_day = states_daily['date'].max().floor(freq='D')
print('**** COVID TRACKING DATE:', latest_tracking_day, ' ****')
#assert((pd.Timestamp.today()).floor("D") == states_daily['date'].max().floor(freq='D'))


# Experiment: KMeans For Clustering

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

#Features used for clustering the geographies
health_cols = ['days_start_to_end','days_until_peak','perc_deaths_at_end']

X = peak_deaths[health_cols]

#X.dropna(how='any',inplace=True)
print(len(X))

#Standardise the features before clustering
scaler = StandardScaler()
X = scaler.fit_transform(X)

X

#Elbow method: within-cluster sum of squares for 1..19 clusters
cluster_counts = range(1, 20)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
    wcss.append(kmeans.fit(X).inertia_)

plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

#Final model with 5 clusters
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)

# Write Backups To Box

In [None]:
#Change Dir
def _backup_path(prefix):
    """Return the backup CSV path for `prefix`, stamped with the current timestamp ('-' -> '_')."""
    stamp = str(pd.Timestamp.today()).replace("-","_")
    return f'{BOX_PATH}/Data/backups/{prefix}_bu_{stamp}.csv'

ta_total.to_csv(_backup_path('state_actions'))
states_daily.to_csv(_backup_path('states_daily'))

# Final Write All Data

## Final processing


In [None]:
states_daily.drop(['Year','ID Geography','hash','fips','Location','state'], axis=1,inplace=True)

In [None]:
states_daily.rename(columns={'State Initial': 'State'}, inplace=True)

In [None]:
# get a list of columns
cols = list(states_daily)

In [None]:
# move the columns to the head of the list; each one is put at position 0 in turn,
# so the list ends up starting with State, Geography, date
for leading_col in ['date','Geography', 'State']:
    cols.remove(leading_col)
    cols.insert(0, leading_col)
    

In [None]:
# use reindex to put the columns in the order of the `cols` list built above
states_daily = states_daily.reindex(columns =cols)

## Final write

In [None]:
#Final outputs, written in this order: main combined frame, action history and recent actions,
#projections, demographics
#Change Dir
final_outputs = [
    #Main
    (states_daily, f'{BOX_PATH}/Data/Combined/states_all.csv'),
    #Actions
    (ta_total, f'{box_dir}/Actions/historical/state_actions_historical.csv'),
    (l, f'{box_dir}/Actions/current/recent_actions.csv'),
    #Projections
    (projections, f'{BOX_PATH}/Data/Projections/projections_state.csv'),
    #Demographics
    (full_state_static, f'{BOX_PATH}/Data/Static/State/states_2018_full.csv'),
]
for frame, destination in final_outputs:
    frame.to_csv(destination, index=False)

In [None]:
#NOTE(review): always fails -- presumably a deliberate stop so a "Run All" halts before the scratch cell below; confirm
assert(False)

In [None]:
l#.to_csv(f'{box_dir}/Actions/current/recent_actions_test.csv', index=False)