In [1]:
import pandas as pd
import numpy as np
from datetime import *
from glob import glob

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Show the installed TensorFlow version so the run environment is recorded with the outputs.
print(tf.__version__)

2.1.0


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

In [4]:
import pylab as plt
%matplotlib inline

import warnings
# NOTE: no category or module is given, so this suppresses every warning raised
# after this point (deprecation and convergence notices included).
warnings.filterwarnings("ignore")

## Get data

In [5]:
# Only the first line of steps.dat is used: it is split on whitespace and the
# resulting tokens are kept, in order, as `steps`.
with open('pipeline_data/steps.dat', 'r') as handle:
    first_line = handle.readline()
steps = first_line.split()

In [6]:
# Locate the dated GLM export. glob() returns its matches in arbitrary order, so
# sort the ISO-dated file names and take the last one (the latest date) rather
# than whichever file the OS happens to list first.
# NOTE(review): assumes the newest export is the wanted one when several exist — confirm.
csv_matches = sorted(glob('pipeline_data/data_????-??-??_glm.csv'))
if not csv_matches:
    raise FileNotFoundError(
        "no file matching 'pipeline_data/data_????-??-??_glm.csv' was found"
    )
csv_fname = csv_matches[-1]

# Read `fips` as text (it is sliced by character position further down) and use
# it as the row index.
df = pd.read_csv(csv_fname, dtype={'fips': str}).set_index('fips')
df.shape

(3094, 134)

## Functions

In [7]:
class tfRegr:
    """Linear regressor built with Keras: a Normalization layer followed by one Dense unit.

    The class mimics the ``fit`` / ``predict`` shape of the scikit-learn
    regressors used alongside it, so all three can be driven by the same loop.

    Parameters
    ----------
    epoch : int
        Number of training epochs passed to ``Model.fit``.
    verbose : bool or int
        Keras logging verbosity passed to ``Model.fit``.
    validation_split : float
        Fraction of the training data Keras holds out to compute validation
        results during training.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Attributes
    ----------
    model : keras model or None
        The compiled and trained model; ``None`` until ``fit`` has been called.
    history : keras History or None
        Object returned by ``Model.fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self,
                 epoch=100,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):

        self.normalizer = preprocessing.Normalization()
        self.ep = epoch
        self.vb = verbose
        self.vs = validation_split
        self.lr = learning_rate
        # Populated by fit(); kept as None so predict() can detect an unfitted instance.
        self.model = None
        self.history = None

    def fit(self, X, y):
        """Adapt the normalizer to ``X``, then build, compile and train the model.

        Returns the underlying Keras model (not ``self``), so existing callers
        that rebind ``regr = regr.fit(X, y)`` and then call ``regr.predict``
        keep working unchanged.
        """
        # Learn the per-feature normalization statistics from the training inputs.
        self.normalizer.adapt(X)

        self.model = tf.keras.Sequential([
            self.normalizer,
            layers.Dense(units=1)
        ])

        self.model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=self.lr),
            loss='mean_absolute_error'
        )

        self.history = self.model.fit(
            X, y,
            epochs=self.ep,
            verbose=self.vb,  # logging
            validation_split=self.vs  # validation results on a fraction of the training data
        )
        return self.model

    def predict(self, X):
        """Return the trained model's predictions for ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError('tfRegr.predict() called before fit()')
        return self.model.predict(X)

In [8]:
def predict(df, cur_idx, regrs=None, verbose=True):
    """Fit an ensemble on step ``cur_idx`` and return averaged one- and two-step forecasts.

    Each regressor is trained to map
    ``[case{cur_idx-1}, case{cur_idx-1}_glm, case{cur_idx}_glm]`` to
    ``case{cur_idx}``. It is then applied twice: first with ``case{cur_idx}``
    in the leading feature position (step 1), then with its own step-1 output
    in that position (step 2). The per-model forecasts are averaged row-wise.

    NOTE(review): both forecast steps reuse the same two ``_glm`` columns that
    were used for fitting — confirm this is the intended feature alignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``case{cur_idx-1}``, ``case{cur_idx}``,
        ``case{cur_idx-1}_glm`` and ``case{cur_idx}_glm``. It is only read;
        no columns are added to or removed from it.
    cur_idx : int
        Index of the step used as the training target.
    regrs : dict or None
        Mapping ``name -> regressor``. Each regressor needs ``fit(X, y)``
        returning an object with ``predict(X)``. When ``None`` the default
        ensemble (random forest, extra trees, ``tfRegr``) is built afresh.
    verbose : bool
        When true, print in-sample r2 / rmse for every regressor.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Row-wise mean over regressors of the step-1 and step-2 forecasts,
        ordered like ``df``'s rows.
    """
    if regrs is None:
        regrs = {
            'RF': RandomForestRegressor(min_samples_split=2),
            'ET': ExtraTreesRegressor(min_samples_split=2),
            'TF': tfRegr(),
        }

    cur = f'case{cur_idx}'
    glm = [f'case{cur_idx - 1}_glm', f'case{cur_idx}_glm']

    # Training design matrix / target, and the inputs of the first forecast step.
    X = df[[f'case{cur_idx - 1}'] + glm].values
    y = df[cur].values
    X_step_1 = df[[cur] + glm].values
    glm_values = df[glm].values

    # Forecasts are collected per model in local dicts instead of temporary
    # columns on `df`: the caller's frame stays untouched and unrelated columns
    # whose names happen to end in '_1' / '_2' can no longer be averaged in or dropped.
    step_1, step_2 = {}, {}

    for name, regr in regrs.items():
        fitted = regr.fit(X, y)

        # ================ Evaluation =============== START
        if verbose:
            # Scores are computed on the rows used for fitting, so they describe
            # goodness of fit, not forecast skill.
            y_pred = fitted.predict(X).flatten()
            r2 = r2_score(y, y_pred)
            rmse = np.sqrt(((y - y_pred) ** 2).mean())
            print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')
        # ================ Evaluation =============== END

        step_1[name] = fitted.predict(X_step_1).flatten()

        # Second step: feed the model's own step-1 forecast back in as the leading feature.
        X_step_2 = np.column_stack([step_1[name], glm_values])
        step_2[name] = fitted.predict(X_step_2).flatten()

    # Average across models with pandas so missing values are skipped exactly as before.
    prd_1 = pd.DataFrame(step_1, index=df.index).mean(axis=1).values
    prd_2 = pd.DataFrame(step_2, index=df.index).mean(axis=1).values

    return prd_1, prd_2

## Forecast Runs

In [9]:
# Forecast from the last entry of `steps`.
cur_idx = len(steps) - 1

# Refit the ensemble `num_runs` times and keep each run's pair of prediction
# vectors; the spread across runs is what the later quantile tables summarise.
num_runs = 100
prds_1, prds_2 = [], []

for r in range(num_runs):
    print(f'run = {r}')
    one_wk, two_wk = predict(df, cur_idx)
    prds_1.append(one_wk)
    prds_2.append(two_wk)

run = 0
	RF: r2 = 0.965, rmse = 99.047
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.865, rmse = 194.472
run = 1
	RF: r2 = 0.957, rmse = 109.414
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.865, rmse = 194.981
run = 2
	RF: r2 = 0.963, rmse = 102.296
	ET: r2 = 1.000, rmse = 0.090
	TF: r2 = 0.865, rmse = 194.401
run = 3
	RF: r2 = 0.956, rmse = 111.680
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.865, rmse = 194.451
run = 4
	RF: r2 = 0.976, rmse = 81.685
	ET: r2 = 1.000, rmse = 0.023
	TF: r2 = 0.865, rmse = 194.991
run = 5
	RF: r2 = 0.967, rmse = 95.817
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.410
run = 6
	RF: r2 = 0.969, rmse = 93.436
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.863, rmse = 196.185
run = 7
	RF: r2 = 0.962, rmse = 103.785
	ET: r2 = 1.000, rmse = 0.158
	TF: r2 = 0.864, rmse = 195.453
run = 8
	RF: r2 = 0.959, rmse = 107.478
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.864, rmse = 195.582
run = 9
	RF: r2 = 0.979, rmse = 77.513
	ET: r2 = 1.000, rmse = 0.125
	TF: r2 = 0.865, 

	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.864, rmse = 195.400
run = 81
	RF: r2 = 0.978, rmse = 79.332
	ET: r2 = 1.000, rmse = 0.028
	TF: r2 = 0.865, rmse = 194.451
run = 82
	RF: r2 = 0.947, rmse = 122.295
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.865, rmse = 194.459
run = 83
	RF: r2 = 0.957, rmse = 109.924
	ET: r2 = 1.000, rmse = 0.150
	TF: r2 = 0.865, rmse = 194.735
run = 84
	RF: r2 = 0.979, rmse = 76.840
	ET: r2 = 1.000, rmse = 0.026
	TF: r2 = 0.865, rmse = 194.776
run = 85
	RF: r2 = 0.933, rmse = 137.369
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.865, rmse = 194.780
run = 86
	RF: r2 = 0.954, rmse = 114.041
	ET: r2 = 1.000, rmse = 0.150
	TF: r2 = 0.865, rmse = 194.647
run = 87
	RF: r2 = 0.952, rmse = 115.784
	ET: r2 = 1.000, rmse = 0.167
	TF: r2 = 0.866, rmse = 194.199
run = 88
	RF: r2 = 0.985, rmse = 63.819
	ET: r2 = 1.000, rmse = 0.035
	TF: r2 = 0.864, rmse = 195.304
run = 89
	RF: r2 = 0.964, rmse = 100.732
	ET: r2 = 1.000, rmse = 0.020
	TF: r2 = 0.866, rmse = 194.261
run = 90
	RF: 

## Prepare for submission

In [56]:
# target end dates
# Reference date: the entry of `steps` selected by cur_idx, parsed as YYYY-MM-DD.
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')

# The forecast date is the day after the reference date.
fc_dt = (cur_dt + timedelta(days=1)).date()

# Target end dates lie one and two weeks after the reference date.
nxt_dts = [(cur_dt + timedelta(weeks=n_wk)).date() for n_wk in (1, 2)]
nxt_dt_1, nxt_dt_2 = nxt_dts

# Quantile levels reported for every location.
qs = [.025, .1, .25, .5, .75, .9, .975]

In [38]:
# One frame per forecast horizon: rows follow df's index, one column per run.
county_dfs = [
    pd.DataFrame(data=np.asarray(run_prds).T, index=df.index, columns=range(num_runs))
    for run_prds in (prds_1, prds_2)
]
df_county_prd_1, df_county_prd_2 = county_dfs

### County

In [65]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1

    # Keep the raw per-run county predictions for this horizon on disk.
    csv_fname = f'results/{steps[cur_idx]}_county_case_{w}-wk_{num_runs}.csv'
    county_df.to_csv(csv_fname, float_format='%.2f')

    # Point estimate: mean over the runs, one row per county.
    df_point = (
        county_df.mean(axis=1)
        .reset_index()
        .rename(columns={0: 'value', 'fips': 'location'})
        .assign(type='point', quantile='NA')
    )

    # Quantile estimates: quantiles over the runs, reshaped to one row per
    # (county, quantile level).
    df_quantile = (
        county_df.quantile(qs, axis=1).T
        .reset_index()
        .melt(id_vars=['fips'])
        .sort_values('fips')
        .rename(columns={'fips': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc case',
        target_end_date=nxt_dts[i],
    )

    forecast_dfs.append(df_forecast)

df_forecast_county = pd.concat(forecast_dfs, axis=0, ignore_index=True)
print(df_forecast_county.shape)
df_forecast_county.head()

(49504, 6)


Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1001,,point,138.404722,1 wk ahead inc case,2020-11-07
1,1003,,point,345.367439,1 wk ahead inc case,2020-11-07
2,1005,,point,42.03079,1 wk ahead inc case,2020-11-07
3,1007,,point,53.405,1 wk ahead inc case,2020-11-07
4,1009,,point,150.410186,1 wk ahead inc case,2020-11-07


### States

In [63]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1

    # Sum the county predictions per state, run by run. The state key is the
    # first two characters of the county FIPS index. Grouping on the index
    # directly (instead of reset_index() + a 'state' column) keeps the string
    # `fips` column out of the aggregation: pandas 2.x no longer drops it in
    # sum() but concatenates the strings, which then breaks the row-wise
    # mean / quantile below.
    state_df = (
        county_df.groupby(county_df.index.str[:2])
        .sum()
        .rename_axis('state')
    )

    # ========================= Point estimate =========================START
    df_point = state_df.mean(axis=1).reset_index()\
        .rename(columns={0: 'value', 'state': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    df_quantile = state_df.quantile(qs, axis=1).T.reset_index()\
        .melt(id_vars=['state']).sort_values('state')\
        .rename(columns={'state': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc case'
    df_forecast['target_end_date'] = nxt_dts[i]

    forecast_dfs.append(df_forecast)

df_forecast_state = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_state.head()

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1,,point,9766.977898,1 wk ahead inc case,2020-11-07
1,2,,point,2980.016721,1 wk ahead inc case,2020-11-07
2,4,,point,11327.762787,1 wk ahead inc case,2020-11-07
3,5,,point,7339.597857,1 wk ahead inc case,2020-11-07
4,6,,point,33530.278308,1 wk ahead inc case,2020-11-07


### Nation

In [64]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1

    # National total per run: sum over all counties, laid out as a single
    # row labelled 'US' with one column per run.
    run_totals = county_df.sum(axis=0)
    nation_df = pd.DataFrame(data=run_totals, columns=['US']).T
    nation_df.index.name = 'nation'

    # Point estimate: mean of the national totals over the runs.
    df_point = (
        nation_df.mean(axis=1)
        .reset_index()
        .rename(columns={0: 'value', 'nation': 'location'})
        .assign(type='point', quantile='NA')
    )

    # Quantile estimates: quantiles of the national totals over the runs,
    # reshaped to one row per quantile level.
    df_quantile = (
        nation_df.quantile(qs, axis=1).T
        .reset_index()
        .melt(id_vars=['nation'])
        .sort_values('nation')
        .rename(columns={'nation': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc case',
        target_end_date=nxt_dts[i],
    )

    forecast_dfs.append(df_forecast)

df_forecast_nation = pd.concat(forecast_dfs, axis=0, ignore_index=True)
df_forecast_nation

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,US,,point,614148.643553,1 wk ahead inc case,2020-11-07
1,US,0.025,quantile,611249.453888,1 wk ahead inc case,2020-11-07
2,US,0.1,quantile,611774.960267,1 wk ahead inc case,2020-11-07
3,US,0.25,quantile,612501.644767,1 wk ahead inc case,2020-11-07
4,US,0.5,quantile,613862.373668,1 wk ahead inc case,2020-11-07
5,US,0.75,quantile,615489.870717,1 wk ahead inc case,2020-11-07
6,US,0.9,quantile,616773.537466,1 wk ahead inc case,2020-11-07
7,US,0.975,quantile,618720.082484,1 wk ahead inc case,2020-11-07
8,US,,point,681392.505976,2 wk ahead inc case,2020-11-14
9,US,0.025,quantile,674756.216134,2 wk ahead inc case,2020-11-14


In [66]:
# Stack the county, state and national tables under a fresh 0..n-1 index and
# stamp every row with the forecast date.
df_forecast = pd.concat(
    [df_forecast_county, df_forecast_state, df_forecast_nation],
    ignore_index=True,
).assign(forecast_date=fc_dt)

# Keep only the submission columns, in this order.
cols = ['forecast_date', 'target', 'target_end_date', 'location', 'type', 'quantile', 'value']
df_forecast = df_forecast[cols]

In [70]:
team = 'UChicagoCHATTOPADHYAY'
model = 'UnIT'

# index=False: without it to_csv() prepends the positional row index as an
# unnamed first column, so the file would not consist of exactly the columns
# selected in `cols`.
# NOTE(review): presumably the submission format expects only those columns — confirm.
df_forecast.to_csv(
    f'submission/{fc_dt}-{team}-{model}.csv',
    float_format='%.2f',
    index=False,
)