In [6]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
import pickle 
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from shapely import wkt
#retrieve cleaned data frames 
data_dir = '../cleaning-cook-county/'
gdf_annual = pd.read_csv(f'{data_dir}/cook_county_gdf_year.csv')

#convert to gpd (was having trouble importing csv as gdf)
gdf_annual['geometry'] = gdf_annual['geometry'].apply(wkt.loads)
gdf_annual = gpd.GeoDataFrame(gdf_annual, geometry='geometry')
gdf_annual.crs = {'init': 'EPSG:4269'}
type(gdf_annual)

data_gdf = gdf_annual

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [10]:
gdf_annual['geoid'] = gdf_annual['geoid'].astype(str) #change to string

Process dataframe into a data frame with a Multiindex on location and time

In [11]:
data_gdf.shape

(10624, 21)

In [61]:
# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = ['deaths']
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']


# #jyontika's parameters
validation_year = 2020
first_test_year = 2021
last_test_year = 2022
first_test_timestep = 7
last_test_timestep = 8
lookback_years= 2 #use 2 lookback years
first_train_eval_year = validation_year - lookback_years #2018
last_train_eval_year = validation_year -1 #2019

In [13]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

# re-add the timestep column as a feature because it's useful
multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [14]:
print(lookback_years)
print(num_geoids)
print(len(features_only))

2
1328
9


In [15]:
multiindexed_gdf.shape

(10624, 20)

In [16]:
x_BSF, y_BS = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          features_only, num_geoids)
x_test_BSF, y_test_BS = make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          features_only, num_geoids)
          
# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths'], num_geoids)


2023-08-10 13:43:44.945519: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
x_BSF.shape

TensorShape([2, 1328, 18])

In [18]:
y_BS.shape

TensorShape([2, 1328])

### All Zeroes Model
### lookback = 2 

In [19]:
bpr_over_time_zeroes = models.all_zeroes_model(multiindexed_gdf,
                                        first_test_timestep, last_test_timestep,
                                        num_geoids, bpr_uncertainty_samples=15)

In [20]:
print(f"2021 Average: {np.mean(bpr_over_time_zeroes[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_zeroes[0]) + \
                          np.array(bpr_over_time_zeroes[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.2247144329072049
Zeroes model (Mean, 95% CI): 22.1,
      (21.6-
       22.8)


In [15]:

# Assuming multiindexed_gdf contains the actual target predictions with 'year' and 'deaths' columns
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_zeroes[0])
bpr_samples_2022 = np.mean(bpr_over_time_zeroes[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Zeroes model RMSE for 2021: {rmse_2021:.2f}")
print(f"Zeroes model RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Zeroes model MAE for 2021: {mae_2021:.2f}")
print(f"Zeroes model MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")

Zeroes model RMSE for 2021: 2.33
Zeroes model RMSE for 2022: 2.36
Joint RMSE for 2021 and 2022: 2.34
 
Zeroes model MAE for 2021: 1.32
Zeroes model MAE for 2022: 1.36
Joint MAE for 2021 and 2022: 1.34


### Last Year
#### lookback = 1

In [16]:
bpr_over_time_last_time = models.last_time_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1,bpr_uncertainty_samples=15)

In [17]:
print(f"2021 Average: {np.mean(bpr_over_time_last_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_last_time[0]) + \
                          np.array(bpr_over_time_last_time[1]))/2
                        
print(f"""Last Year model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.6974018220696714
Last Year model (Mean, 95% CI): 71.4,
      (69.9-
       73.1)


In [18]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_last_time[0])
bpr_samples_2022 = np.mean(bpr_over_time_last_time[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Last Year model RMSE for 2021: {rmse_2021:.2f}")
print(f"Last Year model RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Last Year model model MAE for 2021: {mae_2021:.2f}")
print(f"Last Year model MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Last Year model RMSE for 2021: 2.14
Last Year model RMSE for 2022: 2.15
Joint RMSE for 2021 and 2022: 2.15
 
Last Year model model MAE for 2021: 1.25
Last Year model MAE for 2022: 1.29
Joint MAE for 2021 and 2022: 1.27


### Historical Average 
#### lookback = 6 years for cook 

In [19]:
bpr_over_time_avg_time = models.historical_average_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, 6, bpr_uncertainty_samples=15)

In [20]:
print(f"2021 Average: {np.mean(bpr_over_time_avg_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_avg_time[0]) + \
                          np.array(bpr_over_time_avg_time[1]))/2
                        
print(f"""Historical Average model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.7771963685743997
Historical Average model (Mean, 95% CI): 80.0,
      (78.5-
       82.3)


In [21]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_avg_time[0])
bpr_samples_2022 = np.mean(bpr_over_time_avg_time[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Historical Average RMSE for 2021: {rmse_2021:.2f}")
print(f"Historical Average RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Historical Average  MAE for 2021: {mae_2021:.2f}")
print(f"Historical Average  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Historical Average RMSE for 2021: 2.12
Historical Average RMSE for 2022: 2.13
Joint RMSE for 2021 and 2022: 2.12
 
Historical Average  MAE for 2021: 1.24
Historical Average  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.26


### Weighted Historical Average
#### lookback = 6 years (for cook county)

In [22]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [23]:
bpr_over_time_weight_avg = models.scikit_model(multiindexed_gdf, x_BSF_death_only,
                                               y_BS_death_only, x_test_BSF_death_only,
                                               linear_poisson_weighted_avg,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [24]:
print(f"2021 Average: {np.mean(bpr_over_time_weight_avg[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_weight_avg[0]) + \
                          np.array(bpr_over_time_weight_avg[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.9422205385264247
Zeroes model (Mean, 95% CI): 85.7,
      (84.0-
       86.8)


In [25]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_weight_avg[0])
bpr_samples_2022 = np.mean(bpr_over_time_weight_avg[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Weighted Historical Average RMSE for 2021: {rmse_2021:.2f}")
print(f"Weighted Historical Average RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Weighted Historical Average  MAE for 2021: {mae_2021:.2f}")
print(f"Weighted Historical Average  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Weighted Historical Average RMSE for 2021: 2.08
Weighted Historical Average RMSE for 2022: 2.14
Joint RMSE for 2021 and 2022: 2.11
 
Weighted Historical Average  MAE for 2021: 1.22
Weighted Historical Average  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.25


### Linear (Poisson GLM baseline)
#### lookback years = 3

In [26]:
bpr_over_time_linear = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [27]:
print(f"2021 Average: {np.mean(bpr_over_time_linear[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_linear[0]) + \
                          np.array(bpr_over_time_linear[1]))/2
                        
print(f"""Poisson GLM model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.9339573086798942
Poisson GLM model (Mean, 95% CI): 85.2,
      (83.7-
       86.9)


In [28]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_linear[0])
bpr_samples_2022 = np.mean(bpr_over_time_linear[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Linear (Poisson GLM baseline) for 2021: {rmse_2021:.2f}")
print(f"Linear (Poisson GLM baseline) RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Linear (Poisson GLM baseline)  MAE for 2021: {mae_2021:.2f}")
print(f"Linear (Poisson GLM baseline)  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Linear (Poisson GLM baseline) for 2021: 2.08
Linear (Poisson GLM baseline) RMSE for 2022: 2.14
Joint RMSE for 2021 and 2022: 2.11
 
Linear (Poisson GLM baseline)  MAE for 2021: 1.22
Linear (Poisson GLM baseline)  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.25


### Gradient Boosted Trees (Poisson)
#### lookback years = 3 

In [29]:
bpr_over_time_tree = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               hist_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [30]:
print(f"2021 Average: {np.mean(bpr_over_time_tree[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_tree[0]) + \
                          np.array(bpr_over_time_tree[1]))/2
                        
print(f"""Gradient Boosted Trees (Poisson) (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.824183264526769
Gradient Boosted Trees (Poisson) (Mean, 95% CI): 76.8,
      (74.8-
       78.5)


In [31]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_over_time_tree[0])
bpr_samples_2022 = np.mean(bpr_over_time_tree[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Gradient Boosted Trees (Poisson) RMSE for 2021: {rmse_2021:.2f}")
print(f"Gradient Boosted Trees (Poisson) RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Gradient Boosted Trees (Poisson)  MAE for 2021: {mae_2021:.2f}")
print(f"Gradient Boosted Trees (Poisson)  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


Gradient Boosted Trees (Poisson) RMSE for 2021: 2.10
Gradient Boosted Trees (Poisson) RMSE for 2022: 2.16
Joint RMSE for 2021 and 2022: 2.13
 
Gradient Boosted Trees (Poisson)  MAE for 2021: 1.23
Gradient Boosted Trees (Poisson)  MAE for 2022: 1.29
Joint MAE for 2021 and 2022: 1.26


### CASTNet
#### 3 lookback years

In [63]:
# Call the castnet_model function to calculate BPR for CASTNet predictions
bpr_results_castnet = models.castnet_model(multiindexed_gdf, first_test_timestep, last_test_timestep, 
                        num_geoids, bpr_uncertainty_samples=50)

ValueError: 'a' cannot be empty unless no samples are taken

In [62]:
print(f"2021 Average: {np.mean(bpr_results_castnet[0])}")

bpr_samples_both_years = (np.array(bpr_results_castnet[0]) + \
                          np.array(bpr_results_castnet[1]))/2
                        
print(f"""CASTNet model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

2021 Average: 0.0
CASTNet model (Mean, 95% CI): 0.0,
      (0.0-
       0.0)


In [41]:
actual_values_2021 = multiindexed_gdf[multiindexed_gdf['year'] == 2021]['deaths'].values
actual_values_2022 = multiindexed_gdf[multiindexed_gdf['year'] == 2022]['deaths'].values

# Calculate the model predictions 
bpr_samples_2021 = np.mean(bpr_results_castnet[0])
bpr_samples_2022 = np.mean(bpr_results_castnet[1])

# Calculate RMSE for model
rmse_2021 = np.sqrt(np.mean((bpr_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((bpr_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"CASTNet RMSE for 2021: {rmse_2021:.2f}")
print(f"CASTNet RMSE for 2022: {rmse_2022:.2f}")
print(f"Joint RMSE for 2021 and 2022: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(bpr_samples_2021 - actual_values_2021))
mae_2022_samples = np.mean(np.abs(bpr_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"CASTNet  MAE for 2021: {mae_2021:.2f}")
print(f"CASTNet  MAE for 2022: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")


CASTNet RMSE for 2021: 2.07
CASTNet RMSE for 2022: 2.14
Joint RMSE for 2021 and 2022: 2.11
 
CASTNet  MAE for 2021: 1.21
CASTNet  MAE for 2022: 1.28
Joint MAE for 2021 and 2022: 1.25


In [65]:
#import CASTNet Results 
data_dir = '../../CASTNet/hughes-CASTNet/'
results_path = os.path.join(data_dir, 'Results/cook-county-predictions.csv')
CN_results = pd.read_csv(results_path)

#import CASTNet locations
locations_path = os.path.join(data_dir, 'Data/Chicago/locations.txt')
CN_locations = []
with open(locations_path, 'rb') as file:
    for line in file:
        line = line.rstrip().decode("utf-8").split("\t")
        CN_locations.append(line[1])

In [66]:
multiindexed_gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,year,deaths,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,lat,lon,geometry,theme_1_pc,theme_2_pc,theme_3_pc,theme_4_pc,svi_pctile,timestep
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1,2015,1,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7266,0.3552,0.6652,0.8699,0.7562,1
17031010100,2,2016,3,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7266,0.3552,0.6652,0.8699,0.7562,2
17031010100,3,2017,2,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.6993,0.3324,0.6231,0.9294,0.7652,3
17031010100,4,2018,0,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.6993,0.3324,0.6231,0.9294,0.7652,4
17031010100,5,2019,2,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7721,0.5522,0.7028,0.9275,0.8544,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17031844700,4,2018,6,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,4
17031844700,5,2019,3,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,5
17031844700,6,2020,5,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,6
17031844700,7,2021,4,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,7


In [67]:
seed=360
num_locations=num_geoids
removed_locations=250
first_pred_time = first_test_timestep
last_pred_time = last_test_timestep
from pandas import IndexSlice as idx
timestep_col='timestep'
location_col='geoid'
outcome_col='deaths'
bpr_uncertainty_samples = 3
from models import fast_bpr

In [113]:
rng = np.random.default_rng(seed=seed)
num_sampled = num_locations - removed_locations

results_over_time = []

for timestep in range(first_pred_time, last_pred_time+1):
    evaluation_deaths = multiindexed_gdf.loc[idx[:, timestep], :]
    evaluation_deaths = evaluation_deaths.drop(columns=timestep_col).reset_index().set_index(location_col)[outcome_col]

    current_year = 2014 + timestep
    predicted_deaths_df = CN_results[(CN_results['year'] == current_year) & (CN_results['geoid'].isin(CN_locations))]

    if CN_locations is not None:
        # Match the order of locations with the order of data
        evaluation_deaths = evaluation_deaths.loc[CN_locations]

    results_over_samples = []

    for _ in range(bpr_uncertainty_samples):
        sampled_indices = rng.choice(range(num_locations), size=num_sampled, replace=False)

        # Convert evaluation_deaths into a pandas Series
        evaluation_deaths_series = pd.Series(evaluation_deaths.iloc[sampled_indices].values, index=sampled_indices)

        # Use predicted_deaths_df for the specific year
        predicted_deaths_sampled = pd.Series(predicted_deaths_df.iloc[sampled_indices]['prediction'].values, index=sampled_indices)
        results_over_samples.append(fast_bpr(evaluation_deaths_series, predicted_deaths_sampled))

    results_over_time.append(results_over_samples)



In [114]:
fast_bpr(evaluation_deaths_series, predicted_deaths_sampled)

0.7391304347826088

In [115]:
results_over_time

[[0.7772020725388599, 0.7747603833865816, 0.7483108108108106],
 [0.7165605095541403, 0.7279635258358662, 0.7391304347826088]]

In [70]:
CN_results['geoid'] = CN_results['geoid'].astype(str)

In [71]:
CN_results[(CN_results['geoid'].isin(CN_locations))]

Unnamed: 0,geoid,prediction,year
0,17031010100,3.043193,2021
1,17031010201,2.444749,2021
2,17031010202,3.365390,2021
3,17031010300,0.759657,2021
4,17031010400,0.736677,2021
...,...,...,...
2651,17031843700,0.000000,2022
2652,17031843800,1.690513,2022
2653,17031843900,0.704737,2022
2654,17031844600,0.590481,2022


In [89]:
predicted_deaths_df

Unnamed: 0,geoid,prediction,year
0,17031010100,3.043193,2021
1,17031010201,2.444749,2021
2,17031010202,3.365390,2021
3,17031010300,0.759657,2021
4,17031010400,0.736677,2021
...,...,...,...
1323,17031843700,0.149951,2021
1324,17031843800,2.565945,2021
1325,17031843900,1.196703,2021
1326,17031844600,1.463148,2021


In [53]:
CN_locations

['17031010100',
 '17031010201',
 '17031010202',
 '17031010300',
 '17031010400',
 '17031010501',
 '17031010502',
 '17031010503',
 '17031010600',
 '17031010701',
 '17031010702',
 '17031020100',
 '17031020200',
 '17031020301',
 '17031020302',
 '17031020400',
 '17031020500',
 '17031020601',
 '17031020602',
 '17031020701',
 '17031020702',
 '17031020801',
 '17031020802',
 '17031020901',
 '17031020902',
 '17031030101',
 '17031030102',
 '17031030103',
 '17031030104',
 '17031030200',
 '17031030300',
 '17031030400',
 '17031030500',
 '17031030601',
 '17031030603',
 '17031030604',
 '17031030701',
 '17031030702',
 '17031030703',
 '17031030706',
 '17031030800',
 '17031030900',
 '17031031000',
 '17031031100',
 '17031031200',
 '17031031300',
 '17031031400',
 '17031031501',
 '17031031502',
 '17031031700',
 '17031031800',
 '17031031900',
 '17031032100',
 '17031040100',
 '17031040201',
 '17031040202',
 '17031040300',
 '17031040401',
 '17031040402',
 '17031040600',
 '17031040700',
 '17031040800',
 '170310

In [60]:
multiindexed_gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,year,deaths,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,lat,lon,geometry,theme_1_pc,theme_2_pc,theme_3_pc,theme_4_pc,svi_pctile,timestep
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1,2015,1,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7266,0.3552,0.6652,0.8699,0.7562,1
17031010100,2,2016,3,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7266,0.3552,0.6652,0.8699,0.7562,2
17031010100,3,2017,2,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.6993,0.3324,0.6231,0.9294,0.7652,3
17031010100,4,2018,0,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.6993,0.3324,0.6231,0.9294,0.7652,4
17031010100,5,2019,2,17.0,31.0,10100.0,101.0,Census Tract 101,G5020,S,379511.0,0.0,42.021255,-87.66983,"POLYGON ((-87.67720 42.02294, -87.67628 42.022...",0.7721,0.5522,0.7028,0.9275,0.8544,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17031844700,4,2018,6,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,4
17031844700,5,2019,3,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,5
17031844700,6,2020,5,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,6
17031844700,7,2021,4,17.0,31.0,844700.0,8447.0,Census Tract 8447,G5020,S,401852.0,0.0,41.854147,-87.71170,"POLYGON ((-87.71989 41.85268, -87.71989 41.852...",0.9499,0.6778,0.9103,0.9653,0.9760,7


In [84]:
evaluation_deaths_series

125     0
1202    0
1243    0
1157    6
797     0
       ..
975     3
1305    4
1122    1
1051    2
1129    2
Length: 1078, dtype: int64

In [86]:
min(predicted_deaths_sampled.index.values)

1328

In [88]:
predicted_deaths_df.iloc[sampled_indices].values

array([['17031802004', 0.49562705, 2021],
       ['17031410900', 0.030865192, 2021],
       ['17031081403', 1.3580513, 2021],
       ...,
       ['17031081000', 1.6851366, 2021],
       ['17031819000', 0.14012992, 2021],
       ['17031804313', 0.21437049, 2021]], dtype=object)

In [91]:
results_over_samples

[0.7165605095541403, 0.7279635258358662, 0.7391304347826088]

In [92]:
results_over_time

[[0.7772020725388599, 0.7747603833865816, 0.7483108108108106],
 [0.7165605095541403, 0.7279635258358662, 0.7391304347826088]]

In [105]:
predicted_deaths_sampled

720     0.194890
430     0.724540
137     0.944050
240     0.443620
1201    0.147940
          ...   
965     0.139472
1147    0.411543
130     0.302771
1013    2.171164
787     1.909203
Length: 1078, dtype: float64