In [1]:
import pandas as pd
import numpy as np
from glob import glob

In [2]:
import json
from urllib.request import urlopen
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

2.1.0


In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import pylab as plt
%matplotlib inline

## Get data

In [6]:
# The first line of the pipeline's steps file holds whitespace-separated step
# labels; later cells look them up by position (steps[i]) and print them.
with open('../forecast_pipeline/pipeline_data/steps.dat') as steps_file:
    # split() with no separator already discards leading/trailing whitespace.
    steps = steps_file.readline().split()

In [7]:
class tfRegr:
    """Keras regressor: a Normalization layer followed by one Dense unit.

    The constructor only stores the training settings; the network itself is
    built, compiled and trained inside ``fit``.

    Attributes:
        normalizer: ``preprocessing.Normalization`` layer, adapted to ``X`` in ``fit``.
        ep: number of epochs passed to ``model.fit``.
        vb: verbosity flag passed to ``model.fit``.
        vs: fraction of the training data used for validation.
        lr: learning rate handed to the Adam optimizer.
        model: the compiled ``Sequential`` model (set by ``fit``).
        history: object returned by ``model.fit`` (set by ``fit``).
    """

    def __init__(self,
                 epoch=200,
                 verbose=False,
                 validation_split=0.2,
                 learning_rate=.1):
        """Store the training hyper-parameters and create the normalization layer."""
        self.ep, self.vb = epoch, verbose
        self.vs, self.lr = validation_split, learning_rate
        self.normalizer = preprocessing.Normalization()

    def fit(self, X, y):
        """Adapt the normalizer to ``X``, then build, compile and train the model.

        Returns:
            The trained Keras model itself (not this wrapper), so callers that
            do ``regr = regr.fit(X, y)`` end up holding an object with ``predict``.
        """
        # The normalization statistics come from the training features.
        self.normalizer.adapt(X)

        net = tf.keras.Sequential()
        net.add(self.normalizer)
        net.add(layers.Dense(units=1))
        self.model = net

        adam = tf.optimizers.Adam(learning_rate=self.lr)
        net.compile(optimizer=adam, loss='mean_absolute_error')

        fit_kwargs = {
            'epochs': self.ep,
            'verbose': self.vb,           # logging
            'validation_split': self.vs,  # validate on a fraction of the training data
        }
        self.history = net.fit(X, y, **fit_kwargs)
        return self.model

In [8]:
def predict(df, cur_idx, random_state=None):
    """Fit tree ensembles on step ``cur_idx`` and store a next-step forecast in ``df``.

    Each regressor is trained to map
    ``[case{cur_idx-1}, case{cur_idx-1}_glm, case{cur_idx}_glm]`` onto
    ``case{cur_idx}``. The fitted regressor is then applied to
    ``[case{cur_idx}, case{cur_idx-1}_glm, case{cur_idx}_glm]`` and the
    per-row mean over all regressors is written to ``case{cur_idx+1}_prd``.

    Args:
        df: DataFrame holding the ``case*`` and ``case*_glm`` columns named
            above. It is modified in place (one ``*_prd`` column is added).
        cur_idx: integer index of the current step.
        random_state: optional seed forwarded to both ensembles so a run can
            be reproduced; ``None`` (default) keeps them unseeded.

    Returns:
        None. The result is the new ``case{cur_idx+1}_prd`` column of ``df``.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    regrs = {
        'RF': RandomForestRegressor(min_samples_split=2, random_state=random_state),
        'ET': ExtraTreesRegressor(min_samples_split=2, random_state=random_state),
        # 'TF': tfRegr(),
    }

    cur = f'case{cur_idx}'
    prev = f'case{cur_idx - 1}'
    glm = [f'{prev}_glm', f'{cur}_glm']

    train_cols = [prev] + glm
    # NOTE(review): the forecast reuses the same two *_glm columns as training
    # instead of shifting them one step forward -- confirm this is intended.
    forecast_cols = [cur] + glm

    X, y = df[train_cols].values, df[cur].values
    X_next = df[forecast_cols].values

    # Forecasts are collected in a list rather than in temporary '*_1' columns
    # of df, so unrelated columns that happen to end in '_1' are never
    # averaged into the result or dropped.
    forecasts = []
    for name, regr in regrs.items():
        regr = regr.fit(X, y)

        #================ Evaluation ===============START
        # Scored on the training rows themselves: this measures goodness of
        # fit, not out-of-sample forecast skill.
        y_pred = regr.predict(X).flatten()
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(((y - y_pred) ** 2).mean())
        print(f'\t{name}: r2 = {r2:.3f}, rmse = {rmse:.3f}')
        #================ Evaluation ===============END

        forecasts.append(regr.predict(X_next).flatten())

    # Ensemble forecast: per-row average over the regressors.
    df[f'case{cur_idx + 1}_prd'] = np.mean(forecasts, axis=0)

    return

## Plot functions

In [33]:
def plot_(
    values,
    fips,
    state,
    colorscale=None,
    binning_endpoints=None,
    save_fname=None, width=900, height=500):

    """Draw a county-level choropleth for one state and optionally save it as PDF.

    Args:
        values: one value per county, aligned with ``fips``.
        fips: county FIPS codes.
        state: state name used as the choropleth scope.
        colorscale: list of colours for the bins. When ``None``, the
            notebook-level ``colorscale`` variable is looked up at call time.
        binning_endpoints: bin edges for ``values``. When ``None``, the
            notebook-level ``endpts`` variable is looked up at call time.
        save_fname: if given, the figure is also written to this path as PDF.
        width: width of the saved image.
        height: height of the saved image.

    Raises:
        NameError: if an argument is omitted and the matching notebook-level
            variable has not been defined yet.
    """

    # Defaults are resolved when the function is called, not when it is
    # defined: the cell defining plot_ can therefore run before `colorscale`
    # and `endpts` exist, and each section's current values are picked up.
    module_vars = globals()
    if colorscale is None:
        if 'colorscale' not in module_vars:
            raise NameError(
                "plot_: pass `colorscale` or define a global `colorscale` first")
        colorscale = module_vars['colorscale']
    if binning_endpoints is None:
        if 'endpts' not in module_vars:
            raise NameError(
                "plot_: pass `binning_endpoints` or define a global `endpts` first")
        binning_endpoints = module_vars['endpts']

    fig = ff.create_choropleth(
        fips=fips,
        values=values,
        scope=[state],
        show_state_data=True,
        colorscale=colorscale,
        # Honour the argument (previously the global `endpts` was always used).
        binning_endpoints=binning_endpoints,
        round_legend_values=True,
        plot_bgcolor='rgb(255,255,255)',
        paper_bgcolor='rgb(255,255,255)',
        # legend_title='inc case',
        county_outline={'color': 'rgb(220,220,220)', 'width': 0.5},
    )

    fig.update_layout(
        margin={
            'l': 0,
            'b': 0,
            'r': 0,
            't': 0,
            'autoexpand': False,
        },
        showlegend=False,
    )
    if save_fname is not None:
        fig.write_image(
            file=save_fname,
            format='pdf',
            width=width,
            height=height)
        # NOTE(review): the template is cleared only after the file has been
        # written, and only when saving -- confirm whether it was meant to
        # apply to the saved image as well. Order kept as in the original.
        fig.layout.template = None
    fig.show()

## Forecast NY

In [34]:
# Load the New York table; FIPS codes are read as strings and used as the index.
csv_fname = 'results/case_full_NY.csv'
df = pd.read_csv(csv_fname, dtype={'fips': str})
df = df.set_index('fips')

# Steps to forecast. predict(df, k) writes column case{k+1}_prd, so it is
# called with the step just before each target.
indices = np.array([15, 20, 25, 30])
for i in indices - 1:
    print(steps[i])
    predict(df, i)

# Keep only the observed and forecast columns of the target steps.
observed_cols = [f'case{i}' for i in indices]
forecast_cols = [f'case{i}_prd' for i in indices]
tmp = df[observed_cols + forecast_cols].copy()

2020-07-11
	RF: r2 = 0.993, rmse = 11.353
	ET: r2 = 1.000, rmse = 0.000
2020-08-15
	RF: r2 = 0.993, rmse = 12.533
	ET: r2 = 1.000, rmse = 0.000
2020-09-19
	RF: r2 = 0.973, rmse = 26.741
	ET: r2 = 1.000, rmse = 0.000
2020-10-24
	RF: r2 = 0.979, rmse = 40.828
	ET: r2 = 1.000, rmse = 0.000


In [35]:
# Evenly spaced edges over the full data range, printed only as a guide for
# choosing the hand-picked bin edges below.
vmin, vmax = tmp.min().min(), tmp.max().max()
endpts = list(np.mgrid[vmin : vmax : 10j])
print(endpts)

# Hand-picked bin edges actually used for the New York maps.
endpts = [0, 200, 400, 500, 600, 800, 1000, 1200, 1400, 1600]

colorscale = [
    "#fdc890","#fdbb84","#fca56c","#fc8d59",
    "#f67a50","#ef6548","#e34d33","#d7301f",
    "#c51d10","#b30000","#990001","#7f0000"
]

for i in indices:
    date = steps[i]
    print(date)

    # Set these to None to display the maps without writing PDFs.
    save_fname_gnd = f'plots/NY_{date}_gnd.pdf'
    save_fname_prd = f'plots/NY_{date}_prd.pdf'

    # Colour scale and bin edges are passed explicitly so the maps do not
    # depend on whatever globals/defaults plot_ happened to capture.
    plot_(tmp[f'case{i}'].values, tmp.index.values, 'New York',
          colorscale=colorscale, binning_endpoints=endpts,
          save_fname=save_fname_gnd)
    plot_(tmp[f'case{i}_prd'].values, tmp.index.values, 'New York',
          colorscale=colorscale, binning_endpoints=endpts,
          save_fname=save_fname_prd)

[0.0, 184.66666666666666, 369.3333333333333, 554.0, 738.6666666666666, 923.3333333333333, 1108.0, 1292.6666666666665, 1477.3333333333333, 1662.0]
2020-07-18


2020-08-22


2020-09-26


2020-10-31


## Forecast CA

In [36]:
# Load the California table; FIPS codes are read as strings and used as the index.
csv_fname = 'results/case_full_CA.csv'
df = pd.read_csv(csv_fname, dtype={'fips': str})
df = df.set_index('fips')

# Steps to forecast. predict(df, k) writes column case{k+1}_prd, so it is
# called with the step just before each target.
indices = np.array([15, 20, 25, 30])
for i in indices - 1:
    print(steps[i])
    predict(df, i)

# Keep only the observed and forecast columns of the target steps.
observed_cols = [f'case{i}' for i in indices]
forecast_cols = [f'case{i}_prd' for i in indices]
tmp = df[observed_cols + forecast_cols].copy()

2020-07-11
	RF: r2 = 0.991, rmse = 294.440
	ET: r2 = 1.000, rmse = 0.000
2020-08-15
	RF: r2 = 0.959, rmse = 431.319
	ET: r2 = 1.000, rmse = 0.000
2020-09-19
	RF: r2 = 0.945, rmse = 218.048
	ET: r2 = 1.000, rmse = 0.000
2020-10-24
	RF: r2 = 0.964, rmse = 281.117
	ET: r2 = 1.000, rmse = 0.000


In [38]:
# Evenly spaced edges over the full data range, printed only as a guide for
# choosing the hand-picked bin edges below.
vmin, vmax = tmp.min().min(), tmp.max().max()
endpts = list(np.mgrid[vmin : vmax : 10j])
print(endpts)

# Hand-picked bin edges actually used for the California maps.
endpts = [0, 2000, 4000, 6000, 8000, 10000, 12500, 15000, 17500, 20000]

colorscale = [
    "#fdc890","#fdbb84","#fca56c","#fc8d59",
    "#f67a50","#ef6548","#e34d33","#d7301f",
    "#c51d10","#b30000","#990001","#7f0000"
]

for i in indices:
    date = steps[i]
    print(date)

    # Set these to None to display the maps without writing PDFs.
    save_fname_gnd = f'plots/CA_{date}_gnd.pdf'
    save_fname_prd = f'plots/CA_{date}_prd.pdf'

    # Colour scale and bin edges are passed explicitly so the maps do not
    # depend on whatever globals/defaults plot_ happened to capture.
    plot_(tmp[f'case{i}'].values, tmp.index.values, 'California',
          colorscale=colorscale, binning_endpoints=endpts,
          save_fname=save_fname_gnd, width=900)
    plot_(tmp[f'case{i}_prd'].values, tmp.index.values, 'California',
          colorscale=colorscale, binning_endpoints=endpts,
          save_fname=save_fname_prd, width=900)

[0.0, 2528.6666666666665, 5057.333333333333, 7586.0, 10114.666666666666, 12643.333333333332, 15172.0, 17700.666666666664, 20229.333333333332, 22758.0]
2020-07-18


2020-08-22


2020-09-26


2020-10-31
