In [1]:
import pandas as pd
from glob import glob
from subprocess import check_output
from datetime import date
import os
# Show the working directory: the relative 'pipeline_data/...' paths used
# later in this notebook are resolved against it.
os.getcwd()

'/home/yihuang/Documents/CODE/Python_projects/unitcov/forecast_pipeline'

## Download and check covid data

In [2]:
def modify_date(date):
    """Convert an 'M/D/Y' date label into an ISO 'YYYY-MM-DD' string.

    Parameters
    ----------
    date : str
        Date written as month/day/year separated by '/', e.g. '1/22/20'.
        A year below 100 is taken to be 20YY; a larger year (e.g. 2020)
        is used as given.

    Returns
    -------
    str
        The date zero-padded as 'YYYY-MM-DD', so plain string sorting
        orders the labels chronologically.

    Raises
    ------
    ValueError
        If the label does not consist of exactly three integer parts.
    """
    month, day, year = map(int, date.split('/'))
    # Two-digit years belong to the 2000s; previously a four-digit year was
    # also prefixed with '20', producing labels such as '202020-01-22'.
    if year < 100:
        year += 2000
    return f'{year:04d}-{month:02d}-{day:02d}'

In [3]:
url_case = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
url_death = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'

# Read both time series straight from GitHub and discard every row that has
# at least one missing field.
df_case, df_death = (
    pd.read_csv(source).dropna()
    for source in (url_case, url_death)
)

print(f'JHU case data shape = {df_case.shape}')
print(f'JHU death data shape = {df_death.shape}')

JHU case data shape = (3324, 301)
JHU death data shape = (3324, 302)


In [4]:
# Collect the ISO-formatted dates covered by each table.
# Date columns are recognised by their shape (three numeric parts separated
# by '/') instead of a hard-coded '/20' suffix, which silently ignored every
# column dated 2021 or later and froze dates_case[-1] at the end of 2020.
dates_case = sorted(
    modify_date(col) for col in df_case.columns
    if col.count('/') == 2 and all(part.isdigit() for part in col.split('/'))
)
dates_death = sorted(
    modify_date(col) for col in df_death.columns
    if col.count('/') == 2 and all(part.isdigit() for part in col.split('/'))
)

In [5]:
# Sanity check: the case and death series must start and end on the same dates.
date_check = (
    dates_case[0] == dates_death[0]
    and dates_case[-1] == dates_death[-1]
)
print(f'Date check passed? {date_check}')

Date check passed? True


In [6]:
# clean up and save
# Snapshot names are the upstream file name plus the latest date in the data.
# Both files are stamped with the last date of the case series.
csv_case_fname = url_case.split('/')[-1][:-4] + '_' + dates_case[-1] + '.csv'
csv_death_fname = url_death.split('/')[-1][:-4] + '_' + dates_case[-1] + '.csv'

# Delete snapshots left behind by earlier runs so only the newest one remains.
# The death pattern must read 'deaths' (plural) to match the files written
# below; the former 'death_US' pattern never matched, so stale death
# snapshots accumulated.
existing_case_csv = glob('pipeline_data/time_series_covid19_confirmed_US*csv')
existing_death_csv = glob('pipeline_data/time_series_covid19_deaths_US*csv')
# os.remove avoids spawning a shell and works on every platform.
for stale_csv in existing_case_csv + existing_death_csv:
    os.remove(stale_csv)

df_case.to_csv(f'pipeline_data/{csv_case_fname}', index=False)
df_death.to_csv(f'pipeline_data/{csv_death_fname}', index=False)

## Process covid data and make regression dataset

In [7]:
# Weekly checkpoints: every 7th day from the first checkpoint up to today.
start = '2020-04-04'
end = date.today()
dates_ = pd.date_range(start=start, end=end, freq='7D')
print(dates_)

DatetimeIndex(['2020-04-04', '2020-04-11', '2020-04-18', '2020-04-25',
               '2020-05-02', '2020-05-09', '2020-05-16', '2020-05-23',
               '2020-05-30', '2020-06-06', '2020-06-13', '2020-06-20',
               '2020-06-27', '2020-07-04', '2020-07-11', '2020-07-18',
               '2020-07-25', '2020-08-01', '2020-08-08', '2020-08-15',
               '2020-08-22', '2020-08-29', '2020-09-05', '2020-09-12',
               '2020-09-19', '2020-09-26', '2020-10-03', '2020-10-10',
               '2020-10-17', '2020-10-24', '2020-10-31', '2020-11-07'],
              dtype='datetime64[ns]', freq='7D')


In [8]:
# Relabel every 'M/D/YY' date column with its ISO 'YYYY-MM-DD' form.
# Columns are recognised by shape (three numeric parts separated by '/')
# rather than by a '/20' suffix: with the suffix test, columns from 2021
# onwards kept their raw labels and could never match the weekly ISO dates.
df_case = df_case.rename(columns={
    col: modify_date(col) for col in df_case.columns
    if col.count('/') == 2 and all(part.isdigit() for part in col.split('/'))
})
df_death = df_death.rename(columns={
    col: modify_date(col) for col in df_death.columns
    if col.count('/') == 2 and all(part.isdigit() for part in col.split('/'))
})

In [9]:
# Keep only the weekly checkpoints that are present as columns in the case table.
dates = sorted(
    set(df_case.columns.values) & {str(stamp.date()) for stamp in dates_}
)

In [10]:
# Index the case table by a zero-padded, five-character FIPS string and
# drop the descriptive columns, leaving only the dated columns.
df_case = (
    df_case
    .assign(FIPS=df_case['FIPS'].map(lambda code: str(int(code)).zfill(5)))
    .rename(columns={'FIPS': 'fips'})
    .set_index('fips')
    .drop(columns=[
        'UID', 'iso2', 'iso3', 'code3',
        'Admin2', 'Province_State', 'Country_Region',
        'Lat', 'Long_', 'Combined_Key',
    ])
)

# All-zero baseline column; it serves as the starting point when the
# cumulative counts are differenced later on.
df_case['2020-01-21'] = 0

In [11]:
# Index the death table by a zero-padded, five-character FIPS string and
# drop the descriptive columns, leaving only the remaining data columns.
df_death = (
    df_death
    .assign(FIPS=df_death['FIPS'].map(lambda code: str(int(code)).zfill(5)))
    .rename(columns={'FIPS': 'fips'})
    .set_index('fips')
    .drop(columns=[
        'UID', 'iso2', 'iso3', 'code3',
        'Admin2', 'Province_State', 'Country_Region',
        'Lat', 'Long_', 'Combined_Key',
    ])
)

# All-zero baseline column; it serves as the starting point when the
# cumulative counts are differenced later on.
df_death['2020-01-21'] = 0

In [12]:
# Put the zero baseline first so the first step is measured from 0.
cols = ['2020-01-21'] + dates

# Turn cumulative counts into per-step increments. The baseline is labelled
# with index -1; after differencing it is all-NaN and is dropped, leaving
# case0, case1, ... (and likewise death0, death1, ...).
df_case_step = (
    df_case[cols]
    .rename(columns={col: f'case{step}' for step, col in enumerate(cols, start=-1)})
    .diff(axis=1)
    .dropna(axis=1)
)
df_death_step = (
    df_death[cols]
    .rename(columns={col: f'death{step}' for step, col in enumerate(cols, start=-1)})
    .diff(axis=1)
    .dropna(axis=1)
)

# Combine both step tables side by side and replace negative increments with 0.
df_covid = pd.concat([df_case_step, df_death_step], axis=1)
df_covid = df_covid.mask(df_covid < 0, 0)

# Record the date that closes each step, space-separated.
with open('pipeline_data/steps.dat', 'w') as steps_file:
    steps_file.write(' '.join(cols[1:]))

In [13]:
# cleanup and save
# Remove the dated COVID tables written by earlier runs, then store the
# current one stamped with the latest date of the case series.
existing_data_covid_csv = glob('pipeline_data/data_covid_????-??-??.csv')

# os.remove replaces the shell `rm` call: no shell is spawned and the
# cleanup also works on platforms without an `rm` command.
for stale_csv in existing_data_covid_csv:
    os.remove(stale_csv)

df_covid.to_csv(f'pipeline_data/data_covid_{dates_case[-1]}.csv')

### Load non-COVID data and combine

In [14]:
# Read the non-COVID table with fips kept as text, index it by fips, and
# attach the COVID step columns to it (left join on the fips index).
df_non_covid = pd.read_csv(
    'pipeline_data/data_non-covid.csv',
    dtype={'fips': str},
).set_index('fips')
df = df_non_covid.join(df_covid, how='left')

In [15]:
# cleanup and save
# Remove the dated combined tables written by earlier runs, then store the
# current one stamped with the latest date of the case series.
existing_data_csv = glob('pipeline_data/data_????-??-??.csv')

# os.remove replaces the shell `rm` call: no shell is spawned and the
# cleanup also works on platforms without an `rm` command.
for stale_csv in existing_data_csv:
    os.remove(stale_csv)

df.to_csv(f'pipeline_data/data_{dates_case[-1]}.csv')