In [1]:
import pandas as pd
import numpy as np
from datetime import *
from glob import glob

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Print the TensorFlow version so the cell output records which release was used.
print(tf.__version__)

2.1.0


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

In [4]:
import pylab as plt
%matplotlib inline

import warnings
# Suppress every warning issued through the `warnings` module from here on.
# NOTE(review): this is a blanket filter, so warnings raised by any library
# used later in the notebook are hidden as well.
warnings.filterwarnings("ignore")

## Get data

In [5]:
# Only the first line of the steps file is used; its whitespace-separated
# tokens become the list of step labels.
with open('pipeline_data/steps.dat', 'r') as handle:
    first_line = handle.readline()
steps = first_line.strip().split()

2020-11-07 2020-11-14


In [6]:
# Locate the GLM-augmented data file.  glob() returns matches in arbitrary
# order, so the matches are sorted to make the choice deterministic, and an
# explicit error replaces the bare IndexError raised when nothing matches.
csv_pattern = 'pipeline_data/data_*-*-*_glm.csv'
csv_matches = sorted(glob(csv_pattern))
if not csv_matches:
    raise FileNotFoundError(f'no data file matches {csv_pattern!r}')
# NOTE(review): when several files match, the lexicographically first one is
# used; confirm whether the most recent file should be preferred instead.
csv_fname = csv_matches[0]

# Keep the fips codes as strings (not parsed as integers) and use them as the index.
df = pd.read_csv(csv_fname, dtype={'fips': str}).set_index('fips')
df.head()

Unnamed: 0_level_0,case0,case1,case2,case3,case4,case5,case6,case7,case8,case9,...,death21_glm,death22_glm,death23_glm,death24_glm,death25_glm,death26_glm,death27_glm,death28_glm,death29_glm,death30_glm
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,12.0,7.0,6.0,11.0,9.0,23.0,42.0,45.0,61.0,43.0,...,1.6,1.34,1.3,1.61,1.32,1.35,1.49,1.35,1.85,1.9
1003,29.0,37.0,43.0,38.0,34.0,35.0,38.0,19.0,10.0,21.0,...,2.95,2.52,2.18,2.64,2.37,2.13,2.23,2.61,3.23,3.17
1005,2.0,7.0,9.0,14.0,11.0,15.0,21.0,26.0,45.0,40.0,...,1.42,0.95,1.19,1.44,0.98,0.88,0.9,0.66,0.72,0.57
1007,4.0,9.0,13.0,8.0,8.0,3.0,5.0,8.0,14.0,5.0,...,0.76,0.58,0.65,0.8,0.61,0.66,0.76,0.58,0.78,0.83
1009,10.0,2.0,8.0,11.0,9.0,4.0,1.0,4.0,12.0,9.0,...,0.71,0.64,0.57,0.71,0.52,0.54,0.6,0.61,0.78,0.89


## Functions

In [7]:
class tfRegr:
    """Minimal Keras linear regressor: a Normalization layer feeding one Dense unit.

    The model is compiled with the Adam optimizer and a mean-absolute-error
    loss every time `fit` is called.

    Parameters
    ----------
    epoch : int
        Number of epochs passed to ``Model.fit``.
    verbose : bool or int
        Logging verbosity passed to ``Model.fit``.
    validation_split : float
        Fraction of the training data passed to ``Model.fit`` as
        ``validation_split``.
    learning_rate : float
        Learning rate given to the Adam optimizer.
    """

    def __init__(self,
                 epoch=100,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):

        self.normalizer = preprocessing.Normalization()
        self.ep = epoch
        self.vb = verbose
        self.vs = validation_split
        self.lr = learning_rate
        # Populated by fit(); kept as None so predict() can detect an unfitted instance.
        self.model = None
        self.history = None

    def fit(self, X, y):
        """Adapt the normalizer to X, build and train the model.

        Returns the trained Keras model (not ``self``); existing callers rely
        on calling ``.predict`` on that return value.
        """
        # Adapt the normalization layer to the training features.
        self.normalizer.adapt(X)

        self.model = tf.keras.Sequential([
            self.normalizer,
            layers.Dense(units=1)
        ])

        self.model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=self.lr),
            loss='mean_absolute_error'
        )

        self.history = self.model.fit(
            X, y,
            epochs=self.ep,
            verbose=self.vb,  # logging
            validation_split=self.vs  # validation results on a fraction of the training data
        )
        return self.model

    def predict(self, X):
        """Return the fitted model's predictions for X.

        The output is passed through unchanged from the underlying model's
        ``predict``.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        model = getattr(self, 'model', None)
        if model is None:
            raise RuntimeError('tfRegr.predict() called before fit()')
        return model.predict(X)

In [36]:
def predict(df, cur_idx):
    """Fit three regressors on step `cur_idx` and forecast the next two steps.

    Each regressor (random forest, extra trees, `tfRegr`) is trained to map
    ``[case{cur_idx-1}, case{cur_idx-1}_glm, case{cur_idx}_glm]`` onto
    ``case{cur_idx}``.  The fitted model is then applied twice: first with
    ``case{cur_idx}`` in the first feature slot, then with its own first
    forecast in that slot.  The two GLM columns are reused unchanged for both
    forecasts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the case and ``*_glm`` columns named above.  It is only
        read, never modified.
    cur_idx : int
        Index of the current step.

    Returns
    -------
    (prd_1, prd_2) : tuple of numpy.ndarray
        Row-wise mean over the three regressors of the first and second
        forecast, in the row order of `df`.
    """
    regrs = {
        'RF': RandomForestRegressor(min_samples_split=2),
        'ET': ExtraTreesRegressor(min_samples_split=2),
        'TF': tfRegr(),
    }

    cur = f'case{cur_idx}'
    prev = f'case{cur_idx - 1}'
    glm = [f'case{cur_idx - 1}_glm', f'case{cur_idx}_glm']

    X, y = df[[prev] + glm].values, df[cur].values
    glm_vals = df[glm].values
    X_step_1 = df[[cur] + glm].values

    # Predictions are collected locally instead of being written into `df`
    # and recovered by column suffix, so unrelated columns ending in '_1'/'_2'
    # are never averaged in or dropped, and a failure mid-loop cannot leave
    # temporary columns behind in the caller's frame.
    one_step, two_step = {}, {}

    for name, regr in regrs.items():
        regr = regr.fit(X, y)

        #================ Evaluation ===============START
        # NOTE(review): these scores are computed on the training rows themselves.
        y_pred = regr.predict(X).flatten()
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(((y - y_pred) ** 2).mean())
        print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')
        #================ Evaluation ===============END

        # First forecast: current cases take the place of the previous cases.
        one_step[name] = regr.predict(X_step_1).flatten()

        # Second forecast: the first forecast is fed back in as the case feature.
        X_step_2 = np.column_stack([one_step[name], glm_vals])
        two_step[name] = regr.predict(X_step_2).flatten()

    # Average across regressors with pandas, matching the original NaN-skipping mean.
    prd_1 = pd.DataFrame(one_step).mean(axis=1).values
    prd_2 = pd.DataFrame(two_step).mean(axis=1).values

    return prd_1, prd_2

## Forecast Runs

In [37]:
# Position of the last entry in `steps`.
cur_idx = len(steps) - 1

# Number of independent fit/predict repetitions; each one contributes one
# array per forecast horizon.
num_runs = 100
prds_1 = []
prds_2 = []

for r in range(num_runs):
    print(f'run = {r}')
    prd_1, prd_2 = predict(df, cur_idx)
    prds_1 += [prd_1]
    prds_2 += [prd_2]

run = 0
	RF: r2 = 0.948, rmse = 120.588
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.400
run = 1
	RF: r2 = 0.933, rmse = 136.756
	ET: r2 = 1.000, rmse = 0.154
	TF: r2 = 0.865, rmse = 194.342
run = 2
	RF: r2 = 0.960, rmse = 105.706
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865, rmse = 194.613
run = 3
	RF: r2 = 0.961, rmse = 104.694
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.639
run = 4
	RF: r2 = 0.973, rmse = 86.758
	ET: r2 = 1.000, rmse = 0.045
	TF: r2 = 0.866, rmse = 194.059
run = 5
	RF: r2 = 0.955, rmse = 112.979
	ET: r2 = 1.000, rmse = 0.041
	TF: r2 = 0.865, rmse = 194.678
run = 6
	RF: r2 = 0.978, rmse = 78.543
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865, rmse = 194.919
run = 7
	RF: r2 = 0.985, rmse = 64.791
	ET: r2 = 1.000, rmse = 0.193
	TF: r2 = 0.865, rmse = 194.688
run = 8
	RF: r2 = 0.951, rmse = 117.057
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.866, rmse = 193.979
run = 9
	RF: r2 = 0.985, rmse = 65.196
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865,

	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.209
run = 81
	RF: r2 = 0.978, rmse = 77.926
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.863, rmse = 196.075
run = 82
	RF: r2 = 0.984, rmse = 66.159
	ET: r2 = 1.000, rmse = 0.150
	TF: r2 = 0.866, rmse = 193.726
run = 83
	RF: r2 = 0.967, rmse = 95.758
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.451
run = 84
	RF: r2 = 0.956, rmse = 111.340
	ET: r2 = 1.000, rmse = 0.041
	TF: r2 = 0.865, rmse = 194.447
run = 85
	RF: r2 = 0.972, rmse = 89.001
	ET: r2 = 1.000, rmse = 0.052
	TF: r2 = 0.864, rmse = 195.512
run = 86
	RF: r2 = 0.951, rmse = 117.296
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.865, rmse = 194.949
run = 87
	RF: r2 = 0.964, rmse = 101.126
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.866, rmse = 194.034
run = 88
	RF: r2 = 0.963, rmse = 101.364
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865, rmse = 194.425
run = 89
	RF: r2 = 0.971, rmse = 90.544
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865, rmse = 194.949
run = 90
	RF: r2

In [43]:
# Stack the per-run arrays so rows follow df.index and each column is one run.
df_county_prd_1 = pd.DataFrame(
    data=np.transpose(prds_1),
    index=df.index,
    columns=range(num_runs),
)
df_county_prd_2 = pd.DataFrame(
    data=np.transpose(prds_2),
    index=df.index,
    columns=range(num_runs),
)

## Prepare for submission

In [None]:
# Parse the current step label and derive the dates one and two weeks after it.
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')
nxt_dt_1 = cur_dt.date() + timedelta(weeks=1)
nxt_dt_2 = cur_dt.date() + timedelta(weeks=2)
print(nxt_dt_1, nxt_dt_2)

# Quantile levels used for the quantile forecasts.
qs = [0.025, 0.1, 0.25, 0.5, 0.75, 0.9, 0.975]

### points

In [58]:
def _point_forecast(df_prd, target):
    """Build the point-forecast rows for one horizon.

    Takes the row-wise mean over all run columns of `df_prd` and returns a
    frame with columns location, value, type, target and quantile.
    """
    point = (
        df_prd.mean(axis=1)
        .reset_index()
        .rename(columns={0: 'value', 'fips': 'location'})
    )
    point['type'] = 'point'
    point['target'] = target
    point['quantile'] = 'NA'
    return point


# Both horizons go through the same helper so they always carry the same
# columns; previously the 1-week frame was missing its 'target' column.
df_county_point_1 = _point_forecast(df_county_prd_1, '1 wk ahead inc case')
df_county_point_2 = _point_forecast(df_county_prd_2, '2 wk ahead inc case')
df_county_point_2.head()

Unnamed: 0,location,value,type,target,quantile
0,1001,147.804935,point,2 wk ahead inc case,
1,1003,373.260547,point,2 wk ahead inc case,
2,1005,37.836025,point,2 wk ahead inc case,
3,1007,57.68843,point,2 wk ahead inc case,
4,1009,152.562126,point,2 wk ahead inc case,


### quantiles

In [59]:
# Quantile levels computed across the run columns for every location.
qs = [0.025, 0.1, 0.25, 0.5, 0.75, 0.9, 0.975]

# Per-location quantiles across runs, reshaped to long format:
# one row per (location, quantile level).
df_county_quantile_1 = (
    df_county_prd_1.quantile(qs, axis=1).T
    .reset_index()
    .melt(id_vars=['fips'])
    .sort_values('fips')
    .rename(columns={'fips': 'location', 'variable': 'quantile'})
    .assign(type='quantile', target='1 wk ahead inc case')
)

df_county_quantile_2 = (
    df_county_prd_2.quantile(qs, axis=1).T
    .reset_index()
    .melt(id_vars=['fips'])
    .sort_values('fips')
    .rename(columns={'fips': 'location', 'variable': 'quantile'})
    .assign(type='quantile', target='2 wk ahead inc case')
)

In [64]:
# Stack point and quantile rows for both horizons under a fresh 0..n-1 index.
forecast_parts = [
    df_county_point_1,
    df_county_point_2,
    df_county_quantile_1,
    df_county_quantile_2,
]
df_forecast = pd.concat(forecast_parts, ignore_index=True)

Unnamed: 0,location,quantile,target,type,value
0,1001,,1 wk ahead inc case,point,138.509799
1,1003,,1 wk ahead inc case,point,344.258966
2,1005,,1 wk ahead inc case,point,42.213962
3,1007,,1 wk ahead inc case,point,53.129535
4,1009,,1 wk ahead inc case,point,148.942071


In [27]:
# NOTE(review): this cell repeats the date computation at the start of the
# "Prepare for submission" section and carries an out-of-order execution
# count (In [27]); consider keeping a single copy.
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')
nxt_dt_1 = cur_dt.date() + timedelta(weeks=1)
nxt_dt_2 = cur_dt.date() + timedelta(weeks=2)
print(nxt_dt_1, nxt_dt_2)

2020-11-07 2020-11-14
