In [3]:
import pandas as pd
import numpy as np
from datetime import *
from glob import glob

In [4]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

print(tf.__version__)

2.4.0


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

In [6]:
import pylab as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

## Get data

In [7]:
# The first line of steps.dat carries the whitespace-separated step labels
# (later parsed as '%Y-%m-%d' dates).
with open('pipeline_data/steps.dat', 'r') as steps_file:
    first_line = steps_file.readline()
steps = first_line.split()

In [8]:
# Locate the date-stamped GLM output. glob() returns matches in arbitrary
# order, so sort them: ISO dates sort chronologically, which makes the last
# entry the most recent file and keeps the choice deterministic.
glm_csv_matches = sorted(glob('pipeline_data/data_????-??-??_glm.csv'))
if not glm_csv_matches:
    raise FileNotFoundError(
        "No file matching 'pipeline_data/data_????-??-??_glm.csv' was found"
    )
csv_fname = glm_csv_matches[-1]

# Read fips as a string so that any leading zeros in the codes are preserved.
df = pd.read_csv(csv_fname, dtype={'fips': str}).set_index('fips')
df.head()

Unnamed: 0_level_0,case0,case1,case2,case3,case4,case5,case6,case7,case8,case9,...,death29_glm,death30_glm,death31_glm,death32_glm,death33_glm,death34_glm,death35_glm,death36_glm,death37_glm,total_death_glm
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,12.0,7.0,6.0,11.0,11.0,16.0,45.0,48.0,60.0,48.0,...,1.89,1.92,2.58,2.54,3.32,2.95,4.44,4.85,5.0,69.38
1003,34.0,36.0,40.0,45.0,30.0,34.0,42.0,19.0,12.0,29.0,...,3.32,3.21,3.53,4.09,5.28,5.24,7.8,8.73,9.04,95.68
1005,3.0,7.0,8.0,16.0,9.0,14.0,24.0,27.0,47.0,41.0,...,0.7,0.57,1.06,0.67,0.86,0.66,0.98,1.02,1.2,47.25
1007,4.0,9.0,13.0,8.0,7.0,4.0,7.0,7.0,15.0,5.0,...,0.79,0.84,1.29,1.05,1.33,1.13,1.67,1.77,1.9,27.49
1009,12.0,3.0,9.0,12.0,6.0,5.0,2.0,3.0,13.0,11.0,...,0.8,0.9,1.08,1.08,1.22,1.17,1.64,1.77,1.83,14.24


## Functions

In [9]:
class tfRegr:
    """Minimal scikit-learn-style wrapper around a one-layer Keras regressor.

    The network is a ``Normalization`` layer followed by a single ``Dense``
    unit, trained with Adam on mean absolute error.

    Parameters
    ----------
    epoch : int
        Number of training epochs passed to ``Model.fit``.
    verbose : bool or int
        Keras logging verbosity passed to ``Model.fit``.
    validation_split : float
        Fraction of the training data Keras holds out for validation.
    learning_rate : float
        Learning rate of the Adam optimizer.
    """

    def __init__(self,
                 epoch=100,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):

        self.normalizer = preprocessing.Normalization()
        self.ep = epoch
        self.vb = verbose
        self.vs = validation_split
        self.lr = learning_rate
        # Populated by fit(); kept as None so that "not fitted yet" is detectable.
        self.model = None
        self.history = None

    def fit(self, X, y):
        """Adapt the normalizer to ``X``, build the model and train it.

        Returns the underlying Keras model (not ``self``); existing callers
        rebind the result and call ``.predict`` on it directly.
        """
        self.normalizer.adapt(X)

        self.model = tf.keras.Sequential([
            self.normalizer,
            layers.Dense(units=1)
        ])

        self.model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=self.lr),
            loss='mean_absolute_error'
        )

        # NOTE(review): per the Keras docs, validation_split holds out the
        # *last* fraction of the rows (before shuffling), so those rows are
        # never used for the weight updates -- confirm that is acceptable for
        # row-ordered input.
        self.history = self.model.fit(
            X, y,
            epochs=self.ep,
            verbose=self.vb,  # logging
            validation_split=self.vs  # validate on a fraction of the training data
        )
        return self.model

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        The output is whatever the Keras model returns (callers flatten it).

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if getattr(self, 'model', None) is None:
            raise RuntimeError('tfRegr.predict() called before fit()')
        return self.model.predict(X)

In [10]:
def predict(df, cur_idx):
    """Fit a small ensemble on step ``cur_idx`` and roll it forward twice.

    Three regressors (random forest, extra trees and the ``tfRegr`` network)
    are fitted to predict ``case{cur_idx}`` from ``case{cur_idx - 1}`` plus the
    two GLM columns ``case{cur_idx - 1}_glm`` and ``case{cur_idx}_glm``. Each
    fitted model is then applied to ``case{cur_idx}`` (first step ahead) and
    to its own first-step output (second step ahead).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the case and GLM columns named above. It is only read,
        never modified.
    cur_idx : int
        Index of the current step; must be at least 1.

    Returns
    -------
    (prd_1, prd_2) : tuple of numpy.ndarray
        Row-wise mean over the three models of the first- and second-step
        predictions, one value per row of ``df``.

    Raises
    ------
    ValueError
        If ``cur_idx`` is smaller than 1 (there is no previous step).
    """
    if cur_idx < 1:
        raise ValueError(f'cur_idx must be >= 1 to have a previous step, got {cur_idx}')

    regrs = {
        'RF': RandomForestRegressor(min_samples_split=2),
        'ET': ExtraTreesRegressor(min_samples_split=2),
        'TF': tfRegr(),
    }

    cur = f'case{cur_idx}'
    glm = [f'case{cur_idx - 1}_glm', f'case{cur_idx}_glm']

    X = df[[f'case{cur_idx - 1}'] + glm].values
    y = df[cur].values

    # NOTE(review): both roll-forward steps reuse the same two GLM columns
    # that were used for training (they are not shifted forward) -- confirm
    # this is intended.
    glm_values = df[glm].values
    X_step_1 = df[[cur] + glm].values

    # Predictions are collected here instead of being written into ``df`` as
    # scratch columns, so the caller's frame is never mutated and unrelated
    # columns that happen to end in '_1' / '_2' cannot leak into the average.
    step_1, step_2 = {}, {}

    for name, regr in regrs.items():
        fitted = regr.fit(X, y)

        #================ Evaluation ===============START
        # In-sample fit quality only: the metrics are computed on the
        # training rows themselves.
        y_pred = fitted.predict(X).flatten()
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(((y - y_pred) ** 2).mean())
        print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')
        #================ Evaluation ===============END

        # First step ahead: feed the observed current value.
        step_1[name] = fitted.predict(X_step_1).flatten()

        # Second step ahead: feed the model's own first-step prediction.
        X_step_2 = np.column_stack([step_1[name], glm_values])
        step_2[name] = fitted.predict(X_step_2).flatten()

    # pandas' mean skips NaN, matching the previous column-based averaging.
    prd_1 = pd.DataFrame(step_1).mean(axis=1).values
    prd_2 = pd.DataFrame(step_2).mean(axis=1).values

    return prd_1, prd_2

## Forecast runs

In [11]:
# Forecast from the most recent step; echo its label as a sanity check.
cur_idx = len(steps) - 1
print(steps[cur_idx])

num_runs = 100

# Every predict() call builds and fits fresh regressors, so repeating it
# yields a spread of forecasts.
run_outputs = []
for r in range(num_runs):
    print(f'run = {r}')
    run_outputs.append(predict(df, cur_idx))

# Split the (first-step, second-step) pairs into one list per horizon.
prds_1 = [pair[0] for pair in run_outputs]
prds_2 = [pair[1] for pair in run_outputs]

2020-12-19
run = 0
	RF: r2 = 0.944, rmse = 540.013
	ET: r2 = 1.000, rmse = 0.066
	TF: r2 = 0.812, rmse = 990.920
run = 1
	RF: r2 = 0.953, rmse = 493.169
	ET: r2 = 1.000, rmse = 0.060
	TF: r2 = 0.810, rmse = 995.828
run = 2
	RF: r2 = 0.959, rmse = 462.555
	ET: r2 = 1.000, rmse = 0.065
	TF: r2 = 0.804, rmse = 1012.631
run = 3
	RF: r2 = 0.943, rmse = 545.392
	ET: r2 = 1.000, rmse = 0.069
	TF: r2 = 0.809, rmse = 1000.111
run = 4
	RF: r2 = 0.926, rmse = 623.520
	ET: r2 = 1.000, rmse = 0.018
	TF: r2 = 0.810, rmse = 997.121
run = 5
	RF: r2 = 0.934, rmse = 587.351
	ET: r2 = 1.000, rmse = 0.056
	TF: r2 = 0.811, rmse = 994.791
run = 6
	RF: r2 = 0.956, rmse = 480.787
	ET: r2 = 1.000, rmse = 0.017
	TF: r2 = 0.810, rmse = 995.606
run = 7
	RF: r2 = 0.952, rmse = 501.723
	ET: r2 = 1.000, rmse = 0.061
	TF: r2 = 0.812, rmse = 992.228
run = 8
	RF: r2 = 0.955, rmse = 485.698
	ET: r2 = 1.000, rmse = 0.051
	TF: r2 = 0.811, rmse = 993.212
run = 9
	RF: r2 = 0.934, rmse = 588.000
	ET: r2 = 1.000, rmse = 0.034

	ET: r2 = 1.000, rmse = 0.038
	TF: r2 = 0.805, rmse = 1010.053
run = 80
	RF: r2 = 0.965, rmse = 426.013
	ET: r2 = 1.000, rmse = 0.100
	TF: r2 = 0.810, rmse = 997.368
run = 81
	RF: r2 = 0.951, rmse = 506.725
	ET: r2 = 1.000, rmse = 0.053
	TF: r2 = 0.813, rmse = 989.245
run = 82
	RF: r2 = 0.929, rmse = 609.810
	ET: r2 = 1.000, rmse = 0.025
	TF: r2 = 0.811, rmse = 993.825
run = 83
	RF: r2 = 0.969, rmse = 404.858
	ET: r2 = 1.000, rmse = 0.015
	TF: r2 = 0.809, rmse = 998.166
run = 84
	RF: r2 = 0.954, rmse = 493.005
	ET: r2 = 1.000, rmse = 0.075
	TF: r2 = 0.810, rmse = 996.486
run = 85
	RF: r2 = 0.922, rmse = 638.975
	ET: r2 = 1.000, rmse = 0.045
	TF: r2 = 0.808, rmse = 1001.018
run = 86
	RF: r2 = 0.954, rmse = 489.115
	ET: r2 = 1.000, rmse = 0.020
	TF: r2 = 0.812, rmse = 991.227
run = 87
	RF: r2 = 0.945, rmse = 537.078
	ET: r2 = 1.000, rmse = 0.070
	TF: r2 = 0.812, rmse = 992.165
run = 88
	RF: r2 = 0.931, rmse = 600.524
	ET: r2 = 1.000, rmse = 0.057
	TF: r2 = 0.811, rmse = 993.742
run = 89


## Prepare for submission

In [12]:
# target end dates
# Target end dates, all measured from the date of the current step:
# the forecast date is the following day, the targets are 1 and 2 weeks out.
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')
fc_dt = (cur_dt + timedelta(days=1)).date()
nxt_dts = [(cur_dt + timedelta(weeks=weeks_ahead)).date() for weeks_ahead in (1, 2)]
nxt_dt_1, nxt_dt_2 = nxt_dts

# Quantile levels reported alongside the point estimate.
qs = [.025, .1, .25, .5, .75, .9, .975]

In [13]:
# One frame per horizon: rows follow df's index, one column per run.
county_dfs = [
    pd.DataFrame(data=np.array(run_preds).T, index=df.index, columns=range(num_runs))
    for run_preds in (prds_1, prds_2)
]
df_county_prd_1, df_county_prd_2 = county_dfs

### County

In [14]:
forecast_dfs = []

for w, county_df in enumerate(county_dfs, start=1):
    # Keep the raw per-run forecasts for this horizon on disk.
    csv_fname = f'results/{steps[cur_idx]}_county_case_{w}-wk_{num_runs}.csv'
    county_df.to_csv(csv_fname, float_format='%.2f')

    # ---- Point estimate: mean over the runs ----
    df_point = county_df.mean(axis=1).reset_index()
    df_point = df_point.rename(columns={0: 'value', 'fips': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'

    # ---- Quantile estimates: one row per (location, quantile level) ----
    df_quantile = county_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = df_quantile.melt(id_vars=['fips']).sort_values('fips')
    df_quantile = df_quantile.rename(columns={'fips': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc case'
    df_forecast['target_end_date'] = nxt_dts[w - 1]

    forecast_dfs.append(df_forecast)

df_forecast_county = pd.concat(forecast_dfs, axis=0, ignore_index=True)
print(df_forecast_county.shape)
df_forecast_county.head()

(49504, 6)


Unnamed: 0,location,value,type,quantile,target,target_end_date
0,1001,408.078304,point,,1 wk ahead inc case,2020-12-26
1,1003,862.015414,point,,1 wk ahead inc case,2020-12-26
2,1005,71.675797,point,,1 wk ahead inc case,2020-12-26
3,1007,167.599239,point,,1 wk ahead inc case,2020-12-26
4,1009,314.327388,point,,1 wk ahead inc case,2020-12-26


### State

In [15]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1

    # Sum the county runs per state, keyed by the first two characters of the
    # fips index. Grouping on the index (rather than on a frame that also
    # carries fips as a string column) keeps the aggregation purely numeric:
    # pandas >= 2.0 no longer drops string columns in groupby().sum(), which
    # would otherwise break the mean/quantile calls below.
    # NOTE(review): the two-character prefix is presumably the state FIPS
    # code, which only holds if county codes are zero-padded to five
    # characters -- verify against the input file.
    state_df = county_df.groupby(county_df.index.str[:2]).sum().rename_axis('state')

    # ========================= Point estimate =========================START
    df_point = state_df.mean(axis=1).reset_index()\
        .rename(columns={0: 'value', 'state': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    df_quantile = state_df.quantile(qs, axis=1).T.reset_index()\
        .melt(id_vars=['state']).sort_values('state')\
        .rename(columns={'state': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc case'
    df_forecast['target_end_date'] = nxt_dts[i]

    forecast_dfs.append(df_forecast)

df_forecast_state = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_state.head()

Unnamed: 0,location,value,type,quantile,target,target_end_date
0,1,22682.906968,point,,1 wk ahead inc case,2020-12-26
1,2,2588.658755,point,,1 wk ahead inc case,2020-12-26
2,4,36449.680805,point,,1 wk ahead inc case,2020-12-26
3,5,13588.434289,point,,1 wk ahead inc case,2020-12-26
4,6,250274.664117,point,,1 wk ahead inc case,2020-12-26


### Nation

In [16]:
forecast_dfs = []

for w, county_df in enumerate(county_dfs, start=1):
    # National total per run: a single 'US' row with one column per run.
    run_totals = county_df.sum(axis=0)
    nation_df = run_totals.to_frame(name='US').T.rename_axis('nation')

    # ---- Point estimate: mean over the runs ----
    df_point = nation_df.mean(axis=1).reset_index()
    df_point = df_point.rename(columns={0: 'value', 'nation': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'

    # ---- Quantile estimates: one row per quantile level ----
    df_quantile = nation_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = df_quantile.melt(id_vars=['nation']).sort_values('nation')
    df_quantile = df_quantile.rename(columns={'nation': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc case'
    df_forecast['target_end_date'] = nxt_dts[w - 1]

    forecast_dfs.append(df_forecast)

df_forecast_nation = pd.concat(forecast_dfs, axis=0, ignore_index=True)
df_forecast_nation

Unnamed: 0,location,value,type,quantile,target,target_end_date
0,US,1351637.0,point,,1 wk ahead inc case,2020-12-26
1,US,1347077.0,quantile,0.025,1 wk ahead inc case,2020-12-26
2,US,1348144.0,quantile,0.1,1 wk ahead inc case,2020-12-26
3,US,1350218.0,quantile,0.25,1 wk ahead inc case,2020-12-26
4,US,1351777.0,quantile,0.5,1 wk ahead inc case,2020-12-26
5,US,1353573.0,quantile,0.75,1 wk ahead inc case,2020-12-26
6,US,1354545.0,quantile,0.9,1 wk ahead inc case,2020-12-26
7,US,1355256.0,quantile,0.975,1 wk ahead inc case,2020-12-26
8,US,1320004.0,point,,2 wk ahead inc case,2021-01-02
9,US,1313266.0,quantile,0.025,2 wk ahead inc case,2021-01-02


In [17]:
# Stack the county, state and national forecasts into one submission table.
df_forecast = pd.concat(
    [df_forecast_county, df_forecast_state, df_forecast_nation],
    ignore_index=True,
)
df_forecast['forecast_date'] = fc_dt

# Column order of the output file; .copy() makes the selection an independent
# frame so the assignment below does not write into a slice.
cols = ['forecast_date', 'target', 'target_end_date', 'location', 'type', 'quantile', 'value']
df_forecast = df_forecast[cols].copy()

# =============== make negative values zero ==============
# The regressors can output negative numbers; clamp them at zero.
df_forecast['value'] = df_forecast['value'].clip(lower=0)

In [18]:
# Submission file name: <forecast date>-<team>-<model>_case.csv
team = 'UChicagoCHATTOPADHYAY'
model = 'UnIT'
submission_fname = f'results/{fc_dt}-{team}-{model}_case.csv'
df_forecast.to_csv(submission_fname, float_format='%.2f', index=False)