In [32]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models

import tensorflow as tf

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

# Path to a geopandas datafile
data_path='./clean_annual_tract/'
data_gdf = gpd.read_file(data_path)

Process dataframe into a data frame with a Multiindex on location and time

In [15]:

# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = ['deaths']
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']


validation_year = 2019
first_test_year = 2020
last_test_year = 2021
first_test_timestep = 20
last_test_timestep = 21
lookback_years=5
first_train_eval_year = validation_year - lookback_years
last_train_eval_year = validation_year -1

In [16]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

# re-add the timestep column as a feature because it's useful
multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [17]:
x_BSF, y_BS = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          features_only, num_geoids)
x_test_BSF, y_test_BS =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          features_only, num_geoids)

# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths'], num_geoids)

# For the weighted historical average model, we only use deaths as features
x_BSF_no_svi, y_BS_no_svi = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths', 'lat', 'lon', timestep_col], num_geoids)
x_test_BSF_no_svi, y_test_BS_no_svi =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths', 'lat', 'lon', timestep_col], num_geoids)

In [18]:
x_BSF.shape

TensorShape([5, 1620, 45])

In [19]:
y_BS.shape

TensorShape([5, 1620])

In [20]:
bpr_over_time, mae_over_time, rmse_over_time = models.all_zeroes_model(multiindexed_gdf,
                                        first_test_timestep, last_test_timestep,
                                        num_geoids, bpr_uncertainty_samples=50, removed_locations=250)

In [21]:
model_name = 'Zeroes'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Zeroes model BPR (Mean, 95% CI): 25.2,
      (24.8-
       25.4)
Zeroes model MAE (Mean, 95% CI): 1.24,
      (1.22-
       1.26)
Zeroes model RMSE (Mean, 95% CI): 1.91,
      (1.88-
       1.94)


In [22]:
bpr_over_time, mae_over_time, rmse_over_time = models.last_time_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1,bpr_uncertainty_samples=50,)

In [23]:
model_name = 'Last time'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Last time model BPR (Mean, 95% CI): 51.5,
      (49.6-
       53.7)
Last time model MAE (Mean, 95% CI): 1.08,
      (1.07-
       1.10)
Last time model RMSE (Mean, 95% CI): 1.57,
      (1.55-
       1.59)


In [24]:
bpr_over_time, mae_over_time, rmse_over_time = models.historical_average_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, 7, bpr_uncertainty_samples=50,)

In [25]:
model_name = 'Historical Average'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Historical Average model BPR (Mean, 95% CI): 60.3,
      (58.5-
       62.1)
Historical Average model MAE (Mean, 95% CI): 0.89,
      (0.88-
       0.90)
Historical Average model RMSE (Mean, 95% CI): 1.25,
      (1.23-
       1.27)


Make Scikit models

In [41]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [42]:
bpr_over_time, mae_over_time, rmse_over_time = models.scikit_model(multiindexed_gdf, x_BSF_death_only,
                                               y_BS_death_only, x_test_BSF_death_only,
                                               linear_poisson_weighted_avg,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

In [46]:
model_name = 'Weighted Historical Average'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Weighted Historical Average model BPR (Mean, 95% CI): 61.3,
      (59.2-
       63.2)
Weighted Historical Average model MAE (Mean, 95% CI): 0.97,
      (0.96-
       0.98)
Weighted Historical Average model RMSE (Mean, 95% CI): 1.28,
      (1.27-
       1.30)


In [47]:
bpr_over_time, mae_over_time, rmse_over_time = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [48]:
model_name = 'Linear'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Linear model BPR (Mean, 95% CI): 62.2,
      (60.7-
       63.8)
Linear model MAE (Mean, 95% CI): 0.97,
      (0.96-
       0.98)
Linear model RMSE (Mean, 95% CI): 1.26,
      (1.25-
       1.28)


In [49]:
bpr_over_time, mae_over_time, rmse_over_time = models.scikit_model(multiindexed_gdf, x_BSF_no_svi,
                                               y_BS_no_svi, x_test_BSF_no_svi,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [50]:
model_name = 'Linear No SDOH'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

                        
print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Linear No SDOH model BPR (Mean, 95% CI): 60.9,
      (58.8-
       63.0)
Linear No SDOH model MAE (Mean, 95% CI): 0.98,
      (0.97-
       0.99)
Linear No SDOH model RMSE (Mean, 95% CI): 1.29,
      (1.27-
       1.31)


In [67]:
bpr_over_time, mae_over_time, rmse_over_time = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               hist_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

In [68]:
model_name = 'Gradient Boosted Trees'

bpr_samples_both_years = np.mean(bpr_over_time, axis=0)
mae_samples_both_years = np.mean(mae_over_time, axis=0)
rmse_samples_both_years = np.mean(rmse_over_time, axis=0)

print(f"""{model_name} model BPR (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

print(f"""{model_name} model MAE (Mean, 95% CI): {np.mean(mae_samples_both_years):.2f},
      ({np.percentile(mae_samples_both_years,2.5):.2f}-
       {np.percentile(mae_samples_both_years,97.5):.2f})""")

print(f"""{model_name} model RMSE (Mean, 95% CI): {np.mean(rmse_samples_both_years):.2f},
      ({np.percentile(rmse_samples_both_years,2.5):.2f}-
       {np.percentile(rmse_samples_both_years,97.5):.2f})""")


Gradient Boosted Trees model BPR (Mean, 95% CI): 59.0,
      (57.6-
       61.2)
Gradient Boosted Trees model MAE (Mean, 95% CI): 0.94,
      (0.93-
       0.95)
Gradient Boosted Trees model RMSE (Mean, 95% CI): 1.28,
      (1.26-
       1.30)


In [14]:
import numpy as np

import jax
import jax.numpy as jnp


import scipy.optimize

def calc_neg_log_lik(theta, x_N2, y_N, to_pos=jnp.exp, reduce=True):
    N = x_N2.shape[0]
    y_N = jnp.reshape(y_N, (N,))
    mu_N = to_pos(theta[0] + jnp.dot(x_N2, theta[1:]))
    assert mu_N.shape == (N,)
    assert y_N.shape == mu_N.shape
    loglik_N = -mu_N + y_N * jnp.log(mu_N)
    if reduce:
        return -1.0 * jnp.mean(loglik_N)
    else:
        return -1.0 * loglik_N

class MyPoissonGLM():
    
    def __init__(self, init_theta=np.zeros(3), to_pos=jnp.exp, theta=None):
        self.to_pos = to_pos
        self.init_theta = init_theta
        if theta is not None:
            self.theta = theta
            self.coef_ = theta[1:].copy()
            self.intercept_ = theta[0].copy()
        
    def fit(self, x_N2, y_N):
        calc_grad = jax.grad(calc_neg_log_lik, argnums=[0])

        def f(theta, *args):
            return np.asarray(calc_neg_log_lik(theta, *args), dtype=np.float64, order='F').item()
        def g(theta, *args):
            return np.asarray(calc_grad(theta, *args)[0], dtype=np.float64, order='F').copy()
        ans = scipy.optimize.minimize(
            f, self.init_theta, args=(x_N2, y_N, self.to_pos), jac=g,
            method='L-BFGS-B',
            options={'ftol':1e-13, 'gtol':1e-14})
        self.ans = ans
        self.theta = ans.x
        self.intercept_ = ans.x[0].copy()
        self.coef_ = ans.x[1:].copy()
        return self
        
    def predict(self, x_N2):
        return self.to_pos(self.intercept_ + np.dot(x_N2, self.coef_))
    
    def score(self, x_N2, y_N, eval_logpmf_method='scipy'):
        if eval_logpmf_method.count('scipy'):
            mu_N = self.predict(x_N2)
            return np.mean([-1.0 * scipy.stats.poisson(mu).logpmf(y) for (mu, y) in zip(mu_N, y_N)])
        else:
            return calc_neg_log_lik(self.theta, x_N2, y_N, to_pos=self.to_pos).item()