In [1]:
import pandas as pd
import numpy as np
from datetime import *
from glob import glob

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Print the installed TensorFlow version string.
print(tf.__version__)

2.1.0


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

In [4]:
import pylab as plt
%matplotlib inline

import warnings
# Globally suppress all Python warnings for the rest of the kernel session
# (the "ignore" action with no category/module filter matches every warning).
warnings.filterwarnings("ignore")

## Get data

In [5]:
# Only the first line of steps.dat is read; it is split on whitespace into the
# list of step labels (the last label is later parsed as a %Y-%m-%d date).
with open('pipeline_data/steps.dat', 'r') as steps_file:
    first_line = steps_file.readline()
steps = first_line.strip().split()

In [6]:
# Locate the dated GLM data file. glob() returns matches in arbitrary order, so
# sort them (ISO-formatted dates sort chronologically) and take the newest one
# rather than whichever match happens to come first.
csv_matches = sorted(glob('pipeline_data/data_????-??-??_glm.csv'))
if not csv_matches:
    raise FileNotFoundError(
        "no file matching 'pipeline_data/data_????-??-??_glm.csv'"
    )
csv_fname = csv_matches[-1]

# Read fips as a string (so it is not parsed as a number) and use it as the index.
df = pd.read_csv(csv_fname, dtype={'fips': str}).set_index('fips')
df.head()

Unnamed: 0_level_0,case0,case1,case2,case3,case4,case5,case6,case7,case8,case9,...,death22_glm,death23_glm,death24_glm,death25_glm,death26_glm,death27_glm,death28_glm,death29_glm,death30_glm,death31_glm
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,12.0,7.0,6.0,11.0,9.0,23.0,42.0,45.0,61.0,43.0,...,1.37,1.31,1.62,1.34,1.38,1.51,1.36,1.86,1.91,2.11
1003,29.0,37.0,43.0,38.0,34.0,35.0,38.0,19.0,10.0,21.0,...,2.56,2.19,2.64,2.41,2.15,2.26,2.62,3.22,3.18,3.3
1005,2.0,7.0,9.0,14.0,11.0,15.0,21.0,26.0,45.0,40.0,...,0.96,1.19,1.45,1.01,0.89,0.91,0.67,0.74,0.58,0.64
1007,4.0,9.0,13.0,8.0,8.0,3.0,5.0,8.0,14.0,5.0,...,0.59,0.65,0.8,0.62,0.67,0.76,0.6,0.79,0.84,0.97
1009,10.0,2.0,8.0,11.0,9.0,4.0,1.0,4.0,12.0,9.0,...,0.65,0.58,0.71,0.53,0.55,0.61,0.62,0.79,0.89,0.91


## Functions

In [7]:
class tfRegr:
    """Linear regressor: a Keras Normalization layer followed by one Dense unit.

    The constructor only stores hyper-parameters and creates the normalization
    layer; the Keras model itself is built, compiled and trained in ``fit``.

    Parameters
    ----------
    epoch : number of training epochs passed to Keras ``fit``.
    verbose : passed through unchanged as Keras ``fit``'s ``verbose`` argument.
    validation_split : fraction of the training data Keras holds out for validation.
    learning_rate : learning rate of the Adam optimizer.
    """

    def __init__(self,
                 epoch=100,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):
        # Hyper-parameters, kept under the short attribute names that fit() reads.
        self.ep, self.vb = epoch, verbose
        self.vs, self.lr = validation_split, learning_rate
        # Normalization layer; its statistics are computed from X inside fit().
        self.normalizer = preprocessing.Normalization()

    def fit(self, X, y):
        """Adapt the normalizer to X, build and train the model on (X, y).

        Stores the model on ``self.model`` and the Keras training history on
        ``self.history``.

        Returns
        -------
        The trained ``tf.keras.Sequential`` model itself (not this wrapper), so
        callers invoke ``predict`` on the returned object.
        """
        self.normalizer.adapt(X)

        # Normalized inputs feed a single linear output unit.
        linear_head = layers.Dense(units=1)
        self.model = tf.keras.Sequential([self.normalizer, linear_head])

        adam = tf.optimizers.Adam(learning_rate=self.lr)
        self.model.compile(optimizer=adam, loss='mean_absolute_error')

        fit_options = dict(
            epochs=self.ep,
            verbose=self.vb,           # logging
            validation_split=self.vs,  # validate on a fraction of the training data
        )
        self.history = self.model.fit(X, y, **fit_options)
        return self.model

In [8]:
def predict(df, cur_idx, regrs=None):
    """Fit an ensemble of regressors and return averaged 1- and 2-step forecasts.

    Each regressor is trained to map
    ``[case{cur_idx-1}, case{cur_idx-1}_glm, case{cur_idx}_glm]`` to
    ``case{cur_idx}``. The fitted model is then applied to
    ``[case{cur_idx}, <same two glm columns>]`` for the first step, and to
    ``[<first-step prediction>, <same two glm columns>]`` for the second step.
    Predictions are averaged across regressors (row-wise mean, NaNs skipped).

    Unlike the earlier version, ``df`` is never modified: no temporary columns
    are added, and no existing column whose name ends in ``_1``/``_2`` can be
    averaged in or dropped by accident.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns named above.
    cur_idx : int
        Index of the current step; selects the column names.
    regrs : dict, optional
        Mapping ``name -> estimator``. Each estimator needs ``fit(X, y)`` that
        returns an object exposing ``predict(X)``. Defaults to a fresh
        RandomForest / ExtraTrees / tfRegr trio.

    Returns
    -------
    (prd_1, prd_2) : tuple of 1-D numpy arrays, one value per row of ``df``.

    Side effects
    ------------
    Prints the in-sample r2 and rmse of every regressor.
    """
    if regrs is None:
        regrs = {
            'RF': RandomForestRegressor(min_samples_split=2),
            'ET': ExtraTreesRegressor(min_samples_split=2),
            'TF': tfRegr(),
        }

    cur = f'case{cur_idx}'
    prev = f'case{cur_idx - 1}'
    glm = [f'case{cur_idx - 1}_glm', f'case{cur_idx}_glm']

    X, y = df[[prev] + glm].values, df[cur].values

    # NOTE(review): both forecast steps reuse the same two GLM columns the model
    # was trained on (they are not shifted forward) -- confirm this is intended.
    X_step_1 = df[[cur] + glm].values
    glm_values = df[glm].values

    step_1, step_2 = {}, {}
    for name, regr in regrs.items():
        # sklearn estimators return themselves; tfRegr returns its Keras model.
        fitted = regr.fit(X, y)

        #================ Evaluation ===============START
        y_pred = fitted.predict(X).flatten()
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(((y - y_pred) ** 2).mean())
        print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')
        #================ Evaluation ===============END

        # First step: feed the observed current value.
        pred_1 = fitted.predict(X_step_1).flatten()
        # Second step: feed the first-step prediction back in.
        pred_2 = fitted.predict(np.column_stack([pred_1, glm_values])).flatten()

        step_1[name] = pred_1
        step_2[name] = pred_2

    # Row-wise mean over regressors; DataFrame.mean skips NaNs as before.
    prd_1 = pd.DataFrame(step_1, index=df.index).mean(axis=1).values
    prd_2 = pd.DataFrame(step_2, index=df.index).mean(axis=1).values

    return prd_1, prd_2

## Forecast Runs

In [9]:
# The last entry of `steps` is the current step to forecast from.
cur_idx = len(steps) - 1
print(steps[cur_idx])

# Repeat the fit/predict cycle num_runs times. Every predict() call refits all
# regressors; the per-run results are collected for the quantiles computed later.
num_runs = 100
prds_1, prds_2 = [], []

for r in range(num_runs):
    print(f'run = {r}')
    # predict() returns (1-step, 2-step); route each into its own list.
    for collected, prd in zip((prds_1, prds_2), predict(df, cur_idx)):
        collected.append(prd)

2020-11-07
run = 0
	RF: r2 = 0.965, rmse = 139.752
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.898, rmse = 238.537
run = 1
	RF: r2 = 0.968, rmse = 134.484
	ET: r2 = 1.000, rmse = 0.020
	TF: r2 = 0.900, rmse = 236.832
run = 2
	RF: r2 = 0.962, rmse = 145.754
	ET: r2 = 1.000, rmse = 0.020
	TF: r2 = 0.897, rmse = 239.487
run = 3
	RF: r2 = 0.971, rmse = 127.152
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.898, rmse = 238.441
run = 4
	RF: r2 = 0.953, rmse = 161.376
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.899, rmse = 237.884
run = 5
	RF: r2 = 0.977, rmse = 112.238
	ET: r2 = 1.000, rmse = 0.025
	TF: r2 = 0.899, rmse = 237.410
run = 6
	RF: r2 = 0.990, rmse = 76.379
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.898, rmse = 238.145
run = 7
	RF: r2 = 0.986, rmse = 87.362
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.898, rmse = 239.001
run = 8
	RF: r2 = 0.977, rmse = 112.794
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.899, rmse = 237.632
run = 9
	RF: r2 = 0.972, rmse = 125.898
	ET: r2 = 1.000, rmse = 0.000
	TF

	ET: r2 = 1.000, rmse = 0.199
	TF: r2 = 0.898, rmse = 238.837
run = 81
	RF: r2 = 0.984, rmse = 95.305
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.896, rmse = 241.022
run = 82
	RF: r2 = 0.960, rmse = 148.758
	ET: r2 = 1.000, rmse = 0.020
	TF: r2 = 0.899, rmse = 237.612
run = 83
	RF: r2 = 0.972, rmse = 125.562
	ET: r2 = 1.000, rmse = 0.025
	TF: r2 = 0.900, rmse = 236.439
run = 84
	RF: r2 = 0.981, rmse = 103.683
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.899, rmse = 237.362
run = 85
	RF: r2 = 0.969, rmse = 130.628
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.897, rmse = 240.263
run = 86
	RF: r2 = 0.987, rmse = 83.709
	ET: r2 = 1.000, rmse = 0.083
	TF: r2 = 0.899, rmse = 237.882
run = 87
	RF: r2 = 0.980, rmse = 106.131
	ET: r2 = 1.000, rmse = 0.005
	TF: r2 = 0.898, rmse = 238.343
run = 88
	RF: r2 = 0.965, rmse = 140.237
	ET: r2 = 1.000, rmse = 0.199
	TF: r2 = 0.899, rmse = 237.221
run = 89
	RF: r2 = 0.974, rmse = 120.060
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.899, rmse = 237.672
run = 90
	RF:

## Prepare for submission

In [10]:
# target end dates
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')
fc_dt = (cur_dt + timedelta(days=1)).date()
nxt_dt_1 = (cur_dt + timedelta(days=7)).date()
nxt_dt_2 = (cur_dt + timedelta(days=14)).date()
nxt_dts = [nxt_dt_1, nxt_dt_2]

# quantiles
qs = [.025, .1, .25, .5, .75, .9, .975]

In [11]:
# One frame per horizon: rows follow df's fips index, columns are run numbers.
county_dfs = [
    pd.DataFrame(data=np.array(run_preds).T, index=df.index, columns=range(num_runs))
    for run_preds in (prds_1, prds_2)
]
df_county_prd_1, df_county_prd_2 = county_dfs

### County

In [12]:
forecast_dfs = []

# One pass per horizon (w = 1, 2 weeks ahead), paired with its target end date.
for w, (county_df, target_end) in enumerate(zip(county_dfs, nxt_dts), start=1):
    # Save the raw per-run county predictions for this horizon.
    csv_fname = f'results/{steps[cur_idx]}_county_case_{w}-wk_{num_runs}.csv'
    county_df.to_csv(csv_fname, float_format='%.2f')

    # ========================= Point estimate =========================START
    # Mean over runs for each county.
    df_point = (
        county_df.mean(axis=1)
        .reset_index()
        .rename(columns={0: 'value', 'fips': 'location'})
        .assign(type='point', quantile='NA')
    )
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    # Quantiles over runs for each county, melted to one row per (county, level).
    per_county_q = county_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = (
        per_county_q.melt(id_vars=['fips'])
        .sort_values('fips')
        .rename(columns={'fips': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc case',
        target_end_date=target_end,
    )

    forecast_dfs.append(df_forecast)

df_forecast_county = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
print(df_forecast_county.shape)
df_forecast_county.head()

(49504, 6)


Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1001,,point,126.150924,1 wk ahead inc case,2020-11-14
1,1003,,point,325.923057,1 wk ahead inc case,2020-11-14
2,1005,,point,31.12324,1 wk ahead inc case,2020-11-14
3,1007,,point,61.253172,1 wk ahead inc case,2020-11-14
4,1009,,point,148.793453,1 wk ahead inc case,2020-11-14


### States

In [13]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1

    # Sum counties into states, keyed by the first two characters of the fips
    # index. Grouping the numeric frame directly (instead of a reset_index()
    # copy that still carries the string 'fips' column) keeps non-numeric data
    # out of sum(); on pandas >= 2.0 that string column would otherwise be
    # concatenated and break the row-wise mean/quantile below.
    state_key = county_df.index.str[:2].rename('state')
    state_df = county_df.groupby(state_key).sum()

    # ========================= Point estimate =========================START
    # Mean over runs for each state.
    df_point = state_df.mean(axis=1).reset_index()\
        .rename(columns={0: 'value', 'state': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    # Quantiles over runs for each state, melted to one row per (state, level).
    df_quantile = state_df.quantile(qs, axis=1).T.reset_index()\
        .melt(id_vars=['state']).sort_values('state')\
        .rename(columns={'state': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc case'
    df_forecast['target_end_date'] = nxt_dts[i]

    forecast_dfs.append(df_forecast)

df_forecast_state = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_state.head()

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1,,point,11655.56844,1 wk ahead inc case,2020-11-14
1,2,,point,3077.701174,1 wk ahead inc case,2020-11-14
2,4,,point,13502.044363,1 wk ahead inc case,2020-11-14
3,5,,point,10490.733358,1 wk ahead inc case,2020-11-14
4,6,,point,50690.727653,1 wk ahead inc case,2020-11-14


### Nation

In [14]:
forecast_dfs = []

# One pass per horizon (w = 1, 2 weeks ahead), paired with its target end date.
for w, (county_df, target_end) in enumerate(zip(county_dfs, nxt_dts), start=1):
    # National total per run: a single row labelled 'US', one column per run.
    nation_df = county_df.sum(axis=0).to_frame('US').T.rename_axis('nation')

    # ========================= Point estimate =========================START
    # Mean over runs of the national total.
    df_point = (
        nation_df.mean(axis=1)
        .reset_index()
        .rename(columns={0: 'value', 'nation': 'location'})
        .assign(type='point', quantile='NA')
    )
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    # Quantiles over runs, melted to one row per quantile level.
    nation_q = nation_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = (
        nation_q.melt(id_vars=['nation'])
        .sort_values('nation')
        .rename(columns={'nation': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc case',
        target_end_date=target_end,
    )

    forecast_dfs.append(df_forecast)

df_forecast_nation = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_nation

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,US,,point,881712.0,1 wk ahead inc case,2020-11-14
1,US,0.025,quantile,874161.6,1 wk ahead inc case,2020-11-14
2,US,0.1,quantile,877508.0,1 wk ahead inc case,2020-11-14
3,US,0.25,quantile,879789.6,1 wk ahead inc case,2020-11-14
4,US,0.5,quantile,881277.3,1 wk ahead inc case,2020-11-14
5,US,0.75,quantile,884375.4,1 wk ahead inc case,2020-11-14
6,US,0.9,quantile,885473.3,1 wk ahead inc case,2020-11-14
7,US,0.975,quantile,888502.6,1 wk ahead inc case,2020-11-14
8,US,,point,1058848.0,2 wk ahead inc case,2020-11-21
9,US,0.025,quantile,1043082.0,2 wk ahead inc case,2020-11-21


In [15]:
# Stack county, state and national forecasts into one submission table.
df_forecast = pd.concat(
    [df_forecast_county, df_forecast_state, df_forecast_nation]
).reset_index(drop=True)
df_forecast['forecast_date'] = fc_dt

# Select the output columns in their final order; .copy() makes the result an
# independent frame so the assignment below does not act on a slice.
cols = ['forecast_date', 'target', 'target_end_date', 'location', 'type', 'quantile', 'value']
df_forecast = df_forecast[cols].copy()

# =============== make negative values zero ==============
# clip() replaces every value below 0 with 0 and leaves the rest (and NaN) as is.
df_forecast['value'] = df_forecast['value'].clip(lower=0)

In [19]:
# The submission file is written as results/<forecast_date>-<team>-<model>_case.csv.
team, model = 'UChicagoCHATTOPADHYAY', 'UnIT'
df_forecast.to_csv(
    f'results/{fc_dt}-{team}-{model}_case.csv',
    float_format='%.2f',
    index=False,
)

In [20]:
validation = '/home/yihuang/Documents/Data/covid19-forecast-hub/code/validation/validate_single_forecast_file.py'

! python3 {validation} results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_case.csv


VALIDATING results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_case.csv
✓ results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_case.csv is valid with no errors
