In [1]:
import pandas as pd
import numpy as np
from datetime import *
from glob import glob

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

print(tf.__version__)

2.1.0


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

In [4]:
import pylab as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

## Get data

In [5]:
# Only the first line of steps.dat is used: it holds the whitespace-separated
# step labels (the last one is parsed as a %Y-%m-%d date further down).
with open('pipeline_data/steps.dat', 'r') as handle:
    first_line = handle.readline()
steps = first_line.strip().split()

In [6]:
# Locate the dated GLM feature table. glob() returns matches in arbitrary
# order, so sort the names and take the last one (the most recent date, since
# the name embeds a YYYY-MM-DD stamp) instead of whichever happens to be first.
csv_pattern = 'pipeline_data/data_????-??-??_glm.csv'
csv_matches = sorted(glob(csv_pattern))
if not csv_matches:
    raise FileNotFoundError(f'no input file matches {csv_pattern!r}')
csv_fname = csv_matches[-1]

# FIPS codes are read as strings (they are sliced as text later on) and used as the index.
df = pd.read_csv(csv_fname, dtype={'fips': str}).set_index('fips')
df.head()

Unnamed: 0_level_0,case0,case1,case2,case3,case4,case5,case6,case7,case8,case9,...,death22_glm,death23_glm,death24_glm,death25_glm,death26_glm,death27_glm,death28_glm,death29_glm,death30_glm,death31_glm
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,12.0,7.0,6.0,11.0,9.0,23.0,42.0,45.0,61.0,43.0,...,1.37,1.31,1.62,1.34,1.38,1.51,1.36,1.86,1.91,2.11
1003,29.0,37.0,43.0,38.0,34.0,35.0,38.0,19.0,10.0,21.0,...,2.56,2.19,2.64,2.41,2.15,2.26,2.62,3.22,3.18,3.3
1005,2.0,7.0,9.0,14.0,11.0,15.0,21.0,26.0,45.0,40.0,...,0.96,1.19,1.45,1.01,0.89,0.91,0.67,0.74,0.58,0.64
1007,4.0,9.0,13.0,8.0,8.0,3.0,5.0,8.0,14.0,5.0,...,0.59,0.65,0.8,0.62,0.67,0.76,0.6,0.79,0.84,0.97
1009,10.0,2.0,8.0,11.0,9.0,4.0,1.0,4.0,12.0,9.0,...,0.65,0.58,0.71,0.53,0.55,0.61,0.62,0.79,0.89,0.91


## Functions

In [7]:
class tfRegr:
    """Linear regressor built from a Keras normalization layer and one Dense unit.

    The constructor only stores the training settings; the network itself is
    assembled, compiled and trained inside ``fit``.

    Parameters
    ----------
    epoch : int
        Number of training epochs passed to Keras ``fit``.
    verbose : bool or int
        Logging verbosity passed to Keras ``fit``.
    validation_split : float
        Fraction of the training data Keras holds out for validation.
    learning_rate : float
        Learning rate of the Adam optimizer.
    """

    def __init__(self,
                 epoch=100,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):
        # Created once per instance; its statistics are computed in fit().
        self.normalizer = preprocessing.Normalization()
        self.ep, self.vb = epoch, verbose
        self.vs, self.lr = validation_split, learning_rate

    def fit(self, X, y):
        """Train on ``X``/``y`` and return the fitted Keras model.

        Note that the return value is the Keras ``Sequential`` model, not this
        wrapper, so callers use the returned object for ``predict``. The model
        and the training history are also kept on ``self.model`` and
        ``self.history``.
        """
        # Compute the normalization statistics from the training features.
        self.normalizer.adapt(X)

        stack = [self.normalizer, layers.Dense(units=1)]
        self.model = net = tf.keras.Sequential(stack)

        adam = tf.optimizers.Adam(learning_rate=self.lr)
        net.compile(optimizer=adam, loss='mean_absolute_error')

        fit_options = dict(
            epochs=self.ep,
            verbose=self.vb,            # logging
            validation_split=self.vs,   # validation metrics on a fraction of the training data
        )
        self.history = net.fit(X, y, **fit_options)
        return self.model

In [8]:
def get_prd(x, c=.5, gamma=0.000065):
    """Blend the three model outputs and damp each entry by its step change.

    Parameters
    ----------
    x : mapping of pandas.Series (e.g. a DataFrame)
        Must provide the entries ``'ET'``, ``'RF'`` and ``'TF'``.
    c : float
        Weight of the ``'TF'`` entry; ``'ET'`` and ``'RF'`` each have weight 1.
    gamma : float
        Damping rate applied to the difference between consecutive entries.

    Returns
    -------
    pandas.Series
        The weighted average multiplied element-wise by
        ``exp(-gamma * diff)``, where ``diff`` is the change from the previous
        entry (taken as 0 for the first entry and wherever it is undefined).
    """
    prd = (x['ET'] + x['RF'] + c * x['TF']) / (2 + c)
    step_change = prd.diff().fillna(0).values
    # Vectorized: one ufunc call instead of a Python-level loop over the array.
    return prd * np.exp(-gamma * step_change)

In [9]:
def predict(df, cur_idx, c=.5, gamma=0.0001):
    """Return damped one- and two-step-ahead ensemble death forecasts.

    For each of three regressors (random forest, extra trees and the
    ``tfRegr`` linear model) a case model and a death model are trained with
    features taken at step ``cur_idx - 1`` and targets at step ``cur_idx``.
    The models are then fed the step-``cur_idx`` observations to produce a
    first forecast, and the death model is fed its own first forecast together
    with the case model's forecast to produce a second one. The GLM columns
    are reused unchanged for both steps.

    The three death forecasts are averaged with weights 1 (RF), 1 (ET) and
    ``c`` (TF), and each average is multiplied by
    ``exp(-gamma * (forecast - previous value))``, where the previous value is
    the observed ``death{cur_idx}`` for step one and the undamped step-one
    average for step two.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``death{k}``, ``case{k}``, ``death{k}_glm`` and
        ``case{k}_glm`` for ``k`` in ``(cur_idx - 1, cur_idx)``. The frame is
        only read, never modified.
    cur_idx : int
        Index of the most recent step.
    c : float
        Ensemble weight of the TF model.
    gamma : float
        Damping rate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2, len(df))``: row 0 is the step-one forecast, row 1
        the step-two forecast, in the row order of ``df``.

    Side effects
    ------------
    Prints the in-sample r2 and rmse of every death model.
    """
    # Factories rather than instances: each model is created right before it
    # is fitted, and nothing is constructed that is never used.
    factories = {
        'RF': lambda: RandomForestRegressor(min_samples_split=2),
        'ET': lambda: ExtraTreesRegressor(min_samples_split=2),
        'TF': tfRegr,
    }

    glm = [
        f'death{cur_idx - 1}_glm',
        f'death{cur_idx}_glm',
        f'case{cur_idx - 1}_glm',
        f'case{cur_idx}_glm',
    ]
    case_glm = [f'case{cur_idx - 1}_glm', f'case{cur_idx}_glm']

    # Training data: lagged observations plus GLM columns -> current observation.
    X = df[[f'death{cur_idx - 1}', f'case{cur_idx - 1}'] + glm].values
    y = df[f'death{cur_idx}'].values
    case_X = df[[f'case{cur_idx - 1}'] + case_glm].values
    case_y = df[f'case{cur_idx}'].values

    # Step-one inputs: same column layout, with the lagged slots filled by the
    # current observations.
    glm_values = df[glm].values
    X_step_1 = df[[f'death{cur_idx}', f'case{cur_idx}'] + glm].values
    case_X_step_1 = df[[f'case{cur_idx}'] + case_glm].values

    step_1, step_2 = {}, {}
    for name, make in factories.items():
        # fit() returns the object to predict with (the estimator itself for
        # scikit-learn, the Keras model for tfRegr).
        case_regr = make().fit(case_X, case_y)
        case_1 = case_regr.predict(case_X_step_1).flatten()

        death_regr = make().fit(X, y)

        # In-sample evaluation only: these scores are computed on the training data.
        y_pred = death_regr.predict(X).flatten()
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(((y - y_pred) ** 2).mean())
        print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')

        death_1 = death_regr.predict(X_step_1).flatten()
        # Step two feeds the step-one death and case forecasts back in.
        X_step_2 = np.column_stack([death_1, case_1, glm_values])
        death_2 = death_regr.predict(X_step_2).flatten()

        step_1[name], step_2[name] = death_1, death_2

    def blend(by_name):
        # Weighted mean of the three models' forecasts.
        return (by_name['ET'] + by_name['RF'] + c * by_name['TF']) / (2 + c)

    prd_1, prd_2 = blend(step_1), blend(step_2)

    # Damp each forecast according to its change from the preceding value.
    damped_1 = prd_1 * np.exp(-gamma * (prd_1 - y))
    damped_2 = prd_2 * np.exp(-gamma * (prd_2 - prd_1))

    return np.array([damped_1, damped_2])

## Forecast Runs

In [10]:
# Forecast from the last available step.
cur_idx = len(steps) - 1

num_runs = 100
prds_1 = []
prds_2 = []

# Every call to predict() retrains all models, so the repeated runs give a
# sample of forecasts for each horizon.
for r in range(num_runs):
    print(f'run = {r}')
    prd_1, prd_2 = predict(df, cur_idx)
    prds_1 += [prd_1]
    prds_2 += [prd_2]

run = 0
	RF: r2 = 0.937, rmse = 1.793
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.734, rmse = 3.681
run = 1
	RF: r2 = 0.865, rmse = 2.617
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.702, rmse = 3.894
run = 2
	RF: r2 = 0.961, rmse = 1.415
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.741, rmse = 3.630
run = 3
	RF: r2 = 0.970, rmse = 1.238
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.709, rmse = 3.848
run = 4
	RF: r2 = 0.910, rmse = 2.138
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.722, rmse = 3.764
run = 5
	RF: r2 = 0.922, rmse = 1.989
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.713, rmse = 3.822
run = 6
	RF: r2 = 0.876, rmse = 2.514
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.741, rmse = 3.633
run = 7
	RF: r2 = 0.950, rmse = 1.600
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.738, rmse = 3.655
run = 8
	RF: r2 = 0.784, rmse = 3.316
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.702, rmse = 3.897
run = 9
	RF: r2 = 0.898, rmse = 2.278
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.735, rmse = 3.673
run = 10
	RF: r2 = 0

	RF: r2 = 0.944, rmse = 1.692
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.736, rmse = 3.665
run = 84
	RF: r2 = 0.904, rmse = 2.208
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.678, rmse = 4.047
run = 85
	RF: r2 = 0.918, rmse = 2.042
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.739, rmse = 3.646
run = 86
	RF: r2 = 0.944, rmse = 1.690
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.714, rmse = 3.813
run = 87
	RF: r2 = 0.926, rmse = 1.943
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.748, rmse = 3.582
run = 88
	RF: r2 = 0.918, rmse = 2.044
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.723, rmse = 3.757
run = 89
	RF: r2 = 0.908, rmse = 2.165
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.712, rmse = 3.828
run = 90
	RF: r2 = 0.889, rmse = 2.382
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.743, rmse = 3.614
run = 91
	RF: r2 = 0.923, rmse = 1.976
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.744, rmse = 3.611
run = 92
	RF: r2 = 0.948, rmse = 1.634
	ET: r2 = 1.000, rmse = 0.000
	TF: r2 = 0.738, rmse = 3.649
run = 93
	RF: r2 = 

## Prepare for submission

In [11]:
# target end dates
cur_dt = datetime.strptime(steps[cur_idx], '%Y-%m-%d')
fc_dt = (cur_dt + timedelta(days=1)).date()
nxt_dt_1 = (cur_dt + timedelta(days=7)).date()
nxt_dt_2 = (cur_dt + timedelta(days=14)).date()
nxt_dts = [nxt_dt_1, nxt_dt_2]

# quantiles
qs = [.025, .1, .25, .5, .75, .9, .975]

In [12]:
# One frame per horizon: rows follow df.index, columns are the run numbers.
county_dfs = []
for prds in (prds_1, prds_2):
    data = np.array(prds).T
    data[data < 0] = 0  # negative forecasts are set to zero
    county_dfs.append(pd.DataFrame(data=data, index=df.index, columns=range(num_runs)))

df_county_prd_1, df_county_prd_2 = county_dfs

### County

In [13]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1  # horizon in weeks

    # Keep the raw per-run forecasts for this horizon on disk.
    csv_fname = f'results/{steps[cur_idx]}_county_death_{w}-wk_{num_runs}.csv'
    county_df.to_csv(csv_fname, float_format='%.2f')

    # ---- Point estimate: mean over the runs ----
    run_mean = county_df.mean(axis=1).reset_index()
    df_point = (
        run_mean
        .rename(columns={0: 'value', 'fips': 'location'})
        .assign(**{'type': 'point', 'quantile': 'NA'})
    )

    # ---- Quantile estimates: one row per (location, quantile level) ----
    quantile_wide = county_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = (
        quantile_wide
        .melt(id_vars=['fips'])
        .sort_values('fips')
        .rename(columns={'fips': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc death',
        target_end_date=nxt_dts[i],
    )

    forecast_dfs.append(df_forecast)

df_forecast_county = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
print(df_forecast_county.shape)
df_forecast_county.head()

(49504, 6)


Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1001,,point,0.312781,1 wk ahead inc death,2020-11-14
1,1003,,point,8.010899,1 wk ahead inc death,2020-11-14
2,1005,,point,0.382526,1 wk ahead inc death,2020-11-14
3,1007,,point,1.207907,1 wk ahead inc death,2020-11-14
4,1009,,point,1.735169,1 wk ahead inc death,2020-11-14


### States

In [14]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1  # horizon in weeks

    # The state key is the first two characters of the FIPS string.
    df_tmp = county_df.reset_index()
    df_tmp['state'] = df_tmp['fips'].str[:2]
    # Drop the string 'fips' column before aggregating so that only the numeric
    # run columns are summed. Leaving it in relies on pandas silently
    # discarding non-numeric columns; newer pandas concatenates the strings
    # instead, which breaks the mean/quantile calls below.
    state_df = df_tmp.drop(columns='fips').groupby(by='state').sum()

    # ========================= Point estimate =========================START
    df_point = state_df.mean(axis=1).reset_index()\
        .rename(columns={0: 'value', 'state': 'location'})
    df_point['type'] = 'point'
    df_point['quantile'] = 'NA'
    # ========================= Point estimate =========================END

    # ========================= Quantile estimates =========================START
    df_quantile = state_df.quantile(qs, axis=1).T.reset_index()\
        .melt(id_vars=['state']).sort_values('state')\
        .rename(columns={'state': 'location', 'variable': 'quantile'})
    df_quantile['type'] = 'quantile'
    # ========================= Quantile estimates =========================END

    df_forecast = pd.concat([df_point, df_quantile])
    df_forecast['target'] = f'{w} wk ahead inc death'
    df_forecast['target_end_date'] = nxt_dts[i]

    forecast_dfs.append(df_forecast)

df_forecast_state = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_state.head()

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,1,,point,139.959117,1 wk ahead inc death,2020-11-14
1,2,,point,15.608225,1 wk ahead inc death,2020-11-14
2,4,,point,167.534907,1 wk ahead inc death,2020-11-14
3,5,,point,138.100638,1 wk ahead inc death,2020-11-14
4,6,,point,431.610632,1 wk ahead inc death,2020-11-14


### Nation

In [15]:
forecast_dfs = []

for i, county_df in enumerate(county_dfs):
    w = i + 1  # horizon in weeks

    # Sum every run over all rows, giving a single 'US' row with one column per run.
    run_totals = county_df.sum(axis=0)
    nation_df = pd.DataFrame(data=run_totals, columns=['US']).T
    nation_df.index.name = 'nation'

    # ---- Point estimate: mean over the runs ----
    run_mean = nation_df.mean(axis=1).reset_index()
    df_point = (
        run_mean
        .rename(columns={0: 'value', 'nation': 'location'})
        .assign(**{'type': 'point', 'quantile': 'NA'})
    )

    # ---- Quantile estimates: one row per quantile level ----
    quantile_wide = nation_df.quantile(qs, axis=1).T.reset_index()
    df_quantile = (
        quantile_wide
        .melt(id_vars=['nation'])
        .sort_values('nation')
        .rename(columns={'nation': 'location', 'variable': 'quantile'})
        .assign(type='quantile')
    )

    df_forecast = pd.concat([df_point, df_quantile]).assign(
        target=f'{w} wk ahead inc death',
        target_end_date=nxt_dts[i],
    )

    forecast_dfs.append(df_forecast)

df_forecast_nation = pd.concat(forecast_dfs, axis=0).reset_index(drop=True)
df_forecast_nation

Unnamed: 0,location,quantile,type,value,target,target_end_date
0,US,,point,8012.579201,1 wk ahead inc death,2020-11-14
1,US,0.025,quantile,7786.610184,1 wk ahead inc death,2020-11-14
2,US,0.1,quantile,7850.260807,1 wk ahead inc death,2020-11-14
3,US,0.25,quantile,7954.02385,1 wk ahead inc death,2020-11-14
4,US,0.5,quantile,8009.282339,1 wk ahead inc death,2020-11-14
5,US,0.75,quantile,8073.899276,1 wk ahead inc death,2020-11-14
6,US,0.9,quantile,8172.150616,1 wk ahead inc death,2020-11-14
7,US,0.975,quantile,8266.769929,1 wk ahead inc death,2020-11-14
8,US,,point,9691.885329,2 wk ahead inc death,2020-11-21
9,US,0.025,quantile,9280.635644,2 wk ahead inc death,2020-11-21


In [16]:
# Column order of the submission file.
cols = ['forecast_date', 'target', 'target_end_date', 'location', 'type', 'quantile', 'value']

# Stack the state and national forecasts, stamp the forecast date and clamp
# negative values to zero. clip() builds a new column instead of writing
# through a Series pulled out of the frame (chained assignment).
df_forecast = (
    pd.concat([df_forecast_state, df_forecast_nation])
    .reset_index(drop=True)
    .assign(
        forecast_date=fc_dt,
        value=lambda frame: frame['value'].clip(lower=0),
    )
    [cols]
)

In [30]:
# Identifiers that make up the output file name.
team, model = 'UChicagoCHATTOPADHYAY', 'UnIT'

death_csv = f'results/{fc_dt}-{team}-{model}_death.csv'
df_forecast.to_csv(death_csv, float_format='%.2f', index=False)

## Validation

In [31]:
validation = '/home/yihuang/Documents/Data/covid19-forecast-hub/code/validation/validate_single_forecast_file.py'

! python3 {validation} results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_death.csv


VALIDATING results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_death.csv
✓ results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_death.csv is valid with no errors


In [33]:
case_csv = 'results/2020-11-08-UChicagoCHATTOPADHYAY-UnIT_case.csv'
df_forecast_case = pd.read_csv(case_csv, dtype={'location': str})

df_combined = pd.concat([df_forecast_case, df_forecast], axis=0)
combined_csv = 'submission/2020-11-08-UChicagoCHATTOPADHYAY-UnIT.csv'
df_combined.to_csv(combined_csv, index=False)

! python3 {validation} {combined_csv}


VALIDATING submission/2020-11-08-UChicagoCHATTOPADHYAY-UnIT.csv
✓ submission/2020-11-08-UChicagoCHATTOPADHYAY-UnIT.csv is valid with no errors
