In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models

2023-07-31 14:04:39.540254: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Restore three variables that were previously saved with IPython's %store
# magic. Judging by the names these are the annual, quarterly and
# semi-annual tables with SVI columns attached — TODO confirm.
%store -r gdf_annual_with_svi
%store -r gdf_quarter_with_svi
%store -r gdf_semi_with_svi

# Everything below works off the annual table.
data_gdf = gdf_annual_with_svi

Reshape the dataframe so that it has a MultiIndex on location (`geoid`) and time (`timestep`).

In [4]:
# Display the available columns before naming the ones used as features.
data_gdf.columns

Index(['geoid', 'year', 'deaths', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'NAME',
       'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER', 'lat', 'lon',
       'geometry', 'timestep', 'theme_1_pc', 'theme_2_pc', 'theme_3_pc',
       'theme_4_pc', 'svi_pctile'],
      dtype='object')

In [4]:
# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# SVI covariates, listed once so the column lists below cannot drift apart.
svi_cols = ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']

# These are the columns we could possibly want in the X dataframe.
# NOTE(review): 'neighbor_t' does not appear in the `data_gdf.columns`
# listing shown earlier in this notebook — confirm it exists before
# selecting with this list.
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              *svi_cols, 'year',
              'neighbor_t', outcome_col]

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# Feature switches: past deaths are always a feature; location/time and the
# SVI covariates are optional.
add_spacetime = True
add_svi = True

# These are the features we want
features_only = [outcome_col]
if add_spacetime:
    features_only = features_only + ['lat', 'lon', timestep_col]
if add_svi:
    features_only = features_only + svi_cols


# jyontika's parameters
validation_year = 2020
first_test_year = 2021
last_test_year = 2022
# Timestep indices used for the test period (presumably 6 <-> 2021 and
# 7 <-> 2022 in the annual table — TODO confirm).
first_test_timestep = 6
last_test_timestep = 7
lookback_years = 3  # use 3 lookback years
# Training/evaluation years end the year before validation.
# The earlier comment said 2015, but 2020 - 3 evaluates to 2017.
first_train_eval_year = validation_year - lookback_years  # 2017
last_train_eval_year = validation_year - 1  # 2019

In [5]:
# Index every row by (location, timestep).
index_levels = [geography_col, timestep_col]
multiindexed_gdf = data_gdf.set_index(index_levels)

# set_index moved the timestep out of the columns; copy it back from the
# index so it can still be used as a feature.
timestep_values = multiindexed_gdf.index.get_level_values(timestep_col)
multiindexed_gdf[timestep_col] = timestep_values

# Number of distinct locations in the original (un-indexed) table.
distinct_geoids = data_gdf[geography_col].unique()
num_geoids = len(distinct_geoids)

In [6]:
# Sanity check: lookback length, number of locations, number of features.
for size in (lookback_years, num_geoids, len(features_only)):
    print(size)

3
1327
9


In [7]:
# (first, last) year spans handed to make_data for each split.
train_years = (first_train_eval_year, last_train_eval_year)
test_years = (first_test_year, last_test_year)

# Full feature set: train/eval split, then test split.
x_BSF, y_BS = make_data(
    multiindexed_gdf, *train_years, lookback_years, features_only, num_geoids)
x_test_BSF, y_test_BS = make_data(
    multiindexed_gdf, *test_years, lookback_years, features_only, num_geoids)

# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(
    multiindexed_gdf, *train_years, lookback_years, ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only = make_data(
    multiindexed_gdf, *test_years, lookback_years, ['deaths'], num_geoids)

In [8]:
# Shape of the training features. The recorded output is (3, 1327, 27),
# which is consistent with 1327 locations and 9 features x 3 lookback years.
# Presumably the leading axis is one entry per train/eval year — TODO confirm
# against make_data.
x_BSF.shape

TensorShape([3, 1327, 27])

In [9]:
# Shape of the training targets. The recorded output is (3, 1327), matching
# the first two axes of x_BSF.
y_BS.shape

TensorShape([3, 1327])

### All Zeroes Model
#### lookback = 3 (param)

In [10]:
# Score the all-zeroes baseline over the test timesteps. Downstream cells
# read element [0] as 2021 and element [1] as 2022.
bpr_over_time_zeroes = models.all_zeroes_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    bpr_uncertainty_samples=15,
)

In [11]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
zeroes_samples_2021 = np.array(bpr_over_time_zeroes[0])
zeroes_samples_2022 = np.array(bpr_over_time_zeroes[1])

print(f"2021 Average: {np.mean(bpr_over_time_zeroes[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (zeroes_samples_2021 + zeroes_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

print(f"""Zeroes model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.2127998806202798
Zeroes model (Mean, 95% CI): 21.8,
      (21.5-
       22.2)


In [12]:

# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_zeroes[0])
bpr_samples_2022 = np.mean(bpr_over_time_zeroes[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Zeroes model RMSE for 2021: {rmse_2021:.2f}")
print(f"Zeroes model RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
print(f"Zeroes model MAE for 2021: {mae_2021:.2f}")
print(f"Zeroes model MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")

Zeroes model RMSE for 2021: 2.33
Zeroes model RMSE for 2022: 2.36
Joint RMSE for 2021 and 2022: 2.34
 
Zeroes model MAE for 2021: 1.32
Zeroes model MAE for 2022: 1.36
Joint MAE for 2021 and 2022: 1.34


### Last Year
#### lookback = 1

In [13]:
# Score the last-timestep baseline over the test timesteps.
# NOTE(review): the positional `1` is presumably the lookback (the section
# heading says lookback = 1) — confirm against models.last_time_model.
bpr_over_time_last_time = models.last_time_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,
    bpr_uncertainty_samples=15,
)

In [14]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
last_time_samples_2021 = np.array(bpr_over_time_last_time[0])
last_time_samples_2022 = np.array(bpr_over_time_last_time[1])

print(f"2021 Average: {np.mean(bpr_over_time_last_time[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (last_time_samples_2021 + last_time_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

print(f"""Last Year model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.6103946608532346
Last Year model (Mean, 95% CI): 65.7,
      (63.1-
       68.6)


In [15]:
# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_last_time[0])
bpr_samples_2022 = np.mean(bpr_over_time_last_time[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Last Year model RMSE for 2021: {rmse_2021:.2f}")
print(f"Last Year model RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
# Label fixed: it previously read "Last Year model model MAE".
print(f"Last Year model MAE for 2021: {mae_2021:.2f}")
print(f"Last Year model MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Last Year model RMSE for 2021: 2.17
Last Year model RMSE for 2022: 2.16
Joint RMSE for 2021 and 2022: 2.16
 
Last Year model model MAE for 2021: 1.26
Last Year model MAE for 2022: 1.29
Joint MAE for 2021 and 2022: 1.28


### Historical Average 
#### lookback = 6 years (for Cook County)

In [16]:
# Score the historical-average baseline over the test timesteps.
# NOTE(review): of the two positional integers, the `6` presumably is the
# lookback window (the section heading says 6 years); the meaning of the `1`
# is not visible here — confirm against models.historical_average_model.
bpr_over_time_avg_time = models.historical_average_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,
    6,
    bpr_uncertainty_samples=15,
)

In [17]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
avg_time_samples_2021 = np.array(bpr_over_time_avg_time[0])
avg_time_samples_2022 = np.array(bpr_over_time_avg_time[1])

print(f"2021 Average: {np.mean(bpr_over_time_avg_time[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (avg_time_samples_2021 + avg_time_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

print(f"""Historical Average model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.6612735822500013
Historical Average model (Mean, 95% CI): 72.3,
      (70.8-
       73.5)


In [18]:
# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_avg_time[0])
bpr_samples_2022 = np.mean(bpr_over_time_avg_time[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Historical Average RMSE for 2021: {rmse_2021:.2f}")
print(f"Historical Average RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
print(f"Historical Average  MAE for 2021: {mae_2021:.2f}")
print(f"Historical Average  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Historical Average RMSE for 2021: 2.15
Historical Average RMSE for 2022: 2.13
Joint RMSE for 2021 and 2022: 2.14
 
Historical Average  MAE for 2021: 1.26
Historical Average  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.27


### Weighted Historical Average
#### lookback = 6 years (for Cook County)

In [19]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [20]:
# Weighted historical average: a Poisson GLM given only past deaths.
bpr_over_time_weight_avg = models.scikit_model(
    multiindexed_gdf,
    x_BSF_death_only,             # training features (deaths only)
    y_BS_death_only,              # training targets
    x_test_BSF_death_only,        # test features (deaths only)
    linear_poisson_weighted_avg,  # estimator handed to scikit_model
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [21]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
weight_avg_samples_2021 = np.array(bpr_over_time_weight_avg[0])
weight_avg_samples_2022 = np.array(bpr_over_time_weight_avg[1])

print(f"2021 Average: {np.mean(bpr_over_time_weight_avg[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (weight_avg_samples_2021 + weight_avg_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

# Label fixed: this cell reports the weighted historical average model but
# previously printed "Zeroes model", a leftover from the copied cell.
print(f"""Weighted Historical Average model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.8674948601511524
Zeroes model (Mean, 95% CI): 90.1,
      (88.9-
       91.6)


In [22]:
# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_weight_avg[0])
bpr_samples_2022 = np.mean(bpr_over_time_weight_avg[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Weighted Historical Average RMSE for 2021: {rmse_2021:.2f}")
print(f"Weighted Historical Average RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
print(f"Weighted Historical Average  MAE for 2021: {mae_2021:.2f}")
print(f"Weighted Historical Average  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Weighted Historical Average RMSE for 2021: 2.09
Weighted Historical Average RMSE for 2022: 2.10
Joint RMSE for 2021 and 2022: 2.09
 
Weighted Historical Average  MAE for 2021: 1.23
Weighted Historical Average  MAE for 2022: 1.26
Joint MAE for 2021 and 2022: 1.24


### Linear (Poisson GLM baseline)
#### lookback years = 3

In [23]:
# Poisson GLM baseline on the full feature set.
bpr_over_time_linear = models.scikit_model(
    multiindexed_gdf,
    x_BSF,           # training features
    y_BS,            # training targets
    x_test_BSF,      # test features
    linear_poisson,  # estimator handed to scikit_model
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [24]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
linear_samples_2021 = np.array(bpr_over_time_linear[0])
linear_samples_2022 = np.array(bpr_over_time_linear[1])

print(f"2021 Average: {np.mean(bpr_over_time_linear[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (linear_samples_2021 + linear_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

print(f"""Poisson GLM model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.8549741583117767
Poisson GLM model (Mean, 95% CI): 89.5,
      (88.5-
       90.4)


In [25]:
# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_linear[0])
bpr_samples_2022 = np.mean(bpr_over_time_linear[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

# Label fixed: the 2021 line previously omitted the metric name "RMSE".
print(f"Linear (Poisson GLM baseline) RMSE for 2021: {rmse_2021:.2f}")
print(f"Linear (Poisson GLM baseline) RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
print(f"Linear (Poisson GLM baseline)  MAE for 2021: {mae_2021:.2f}")
print(f"Linear (Poisson GLM baseline)  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Linear (Poisson GLM baseline) for 2021: 2.10
Linear (Poisson GLM baseline) RMSE for 2022: 2.10
Joint RMSE for 2021 and 2022: 2.10
 
Linear (Poisson GLM baseline)  MAE for 2021: 1.23
Linear (Poisson GLM baseline)  MAE for 2022: 1.26
Joint MAE for 2021 and 2022: 1.24


### Gradient Boosted Trees (Poisson)
#### lookback years = 3 

In [26]:
# Gradient-boosted trees with Poisson loss on the full feature set.
bpr_over_time_tree = models.scikit_model(
    multiindexed_gdf,
    x_BSF,         # training features
    y_BS,          # training targets
    x_test_BSF,    # test features
    hist_poisson,  # estimator handed to scikit_model
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [27]:
# Per-year BPR samples as arrays ([0] is reported as 2021, [1] is the second
# test timestep).
tree_samples_2021 = np.array(bpr_over_time_tree[0])
tree_samples_2022 = np.array(bpr_over_time_tree[1])

print(f"2021 Average: {np.mean(bpr_over_time_tree[0])}")

# Element-wise average of the two years' samples.
bpr_samples_both_years = (tree_samples_2021 + tree_samples_2022) / 2

# Mean and the 2.5th / 97.5th percentiles of the samples, as percentages.
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct = np.percentile(bpr_samples_both_years, 2.5) * 100
ci_high_pct = np.percentile(bpr_samples_both_years, 97.5) * 100

print(f"""Gradient Boosted Trees (Poisson) (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.8153497943533374
Gradient Boosted Trees (Poisson) (Mean, 95% CI): 80.8,
      (79.2-
       82.4)


In [28]:
# Observed death counts for each test year, selected via the 'year' column
# of multiindexed_gdf.
in_2021 = multiindexed_gdf['year'] == 2021
in_2022 = multiindexed_gdf['year'] == 2022
actual_values_2021 = multiindexed_gdf[in_2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[in_2022]['deaths'].values

# NOTE(review): each "prediction" here is a single scalar — the mean of that
# year's BPR samples — broadcast against every observed death count. The
# RMSE/MAE below are therefore those of a constant predictor equal to the
# BPR score, not of the model's per-location forecasts. Confirm intent.
bpr_samples_2021 = np.mean(bpr_over_time_tree[0])
bpr_samples_2022 = np.mean(bpr_over_time_tree[1])

errors_2021 = bpr_samples_2021 - actual_values_2021
errors_2022 = bpr_samples_2022 - actual_values_2022

# Root-mean-squared error per year; "joint" is the plain mean of the two.
rmse_2021 = np.sqrt(np.mean(np.square(errors_2021)))
rmse_2022 = np.sqrt(np.mean(np.square(errors_2022)))
joint_rmse = np.mean([rmse_2021, rmse_2022])

# Mean absolute error per year. These are already scalars, so the second
# np.mean leaves the value unchanged.
mae_2021_samples = np.abs(errors_2021).mean()
mae_2022_samples = np.abs(errors_2022).mean()
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Joint MAE: mean of the two yearly MAE values.
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Gradient Boosted Trees (Poisson) RMSE for 2021: {rmse_2021:.2f}")
print(f"Gradient Boosted Trees (Poisson) RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")
print(f"Gradient Boosted Trees (Poisson)  MAE for 2021: {mae_2021:.2f}")
print(f"Gradient Boosted Trees (Poisson)  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Gradient Boosted Trees (Poisson) RMSE for 2021: 2.11
Gradient Boosted Trees (Poisson) RMSE for 2022: 2.13
Joint RMSE for 2021 and 2022: 2.12
 
Gradient Boosted Trees (Poisson)  MAE for 2021: 1.23
Gradient Boosted Trees (Poisson)  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.26
