In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
import pickle 
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models
import evaluation

2023-08-17 14:46:30.540121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from shapely import wkt
#retrieve cleaned data frames 
data_dir = '/Users/jyontika/Desktop/opioid-overdose-models/cook-county/cleaning-cook-county/'
gdf_annual = pd.read_csv(f'{data_dir}/cook_county_gdf_year.csv')

#convert to gpd (was having trouble importing csv as gdf)
gdf_annual['geometry'] = gdf_annual['geometry'].apply(wkt.loads)
gdf_annual = gpd.GeoDataFrame(gdf_annual, geometry='geometry')
gdf_annual.crs = {'init': 'EPSG:4269'}
type(gdf_annual)

data_gdf = gdf_annual

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [3]:
gdf_annual['geoid'] = gdf_annual['geoid'].astype(str) #change to string

Process dataframe into a data frame with a Multiindex on location and time

In [4]:
data_gdf.shape

(10624, 21)

In [5]:
# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = ['deaths']
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']


# #jyontika's parameters
validation_year = 2020
first_test_year = 2021
last_test_year = 2022
first_test_timestep = 7
last_test_timestep = 8
lookback_years= 2 #use 2 lookback years
first_train_eval_year = validation_year - lookback_years #2018
last_train_eval_year = validation_year -1 #2019

In [6]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

# re-add the timestep column as a feature because it's useful
multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [7]:
print(lookback_years)
print(num_geoids)
print(len(features_only))

2
1328
9


In [8]:
multiindexed_gdf.shape

(10624, 20)

In [9]:
x_BSF, y_BS = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          features_only, num_geoids)
x_test_BSF, y_test_BS = make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          features_only, num_geoids)
          
# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths'], num_geoids)


In [10]:
x_BSF.shape

TensorShape([2, 1328, 18])

In [11]:
y_BS.shape

TensorShape([2, 1328])

### All Zeroes Model
### lookback = 2 

In [13]:
bpr_over_time_zeroes, actual_over_time_zeroes, predicted_over_time_zeroes = models.all_zeroes_model(multiindexed_gdf,
                                        first_test_timestep, last_test_timestep, num_geoids, bpr_uncertainty_samples=15)

In [77]:
print(f"2021 Average: {np.mean(bpr_over_time_zeroes[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_zeroes[0]) + \
                          np.array(bpr_over_time_zeroes[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.21800659589181517
Zeroes model (Mean, 95% CI): 21.7,
      (21.1-
       22.2)


In [32]:
zeroes_rmse_results, zeroes_mae_results  = evaluation.calculate_metrics(actual_over_time_zeroes, predicted_over_time_zeroes, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=15 )

In [33]:
zeroes_rmse_mean, zeroes_rmse_conf_interval = zeroes_rmse_results
zeroes_mae_mean, zeroes_mae_conf_interval = zeroes_mae_results

evaluation.print_results("RMSE for Zeroes Model", zeroes_rmse_mean, zeroes_rmse_conf_interval)
evaluation.print_results("MAE for Zeroes Model", zeroes_mae_mean, zeroes_mae_conf_interval)


RMSE for Zeroes Model (Mean, 95% CI): 2.39, (2.35-2.42)
MAE for Zeroes Model (Mean, 95% CI): 1.33, (1.32-1.33)


### Last Year
#### lookback = 1

In [37]:
bpr_over_time_last_time, actual_over_time_last_time, predicted_over_time_last_time  = models.last_time_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1,bpr_uncertainty_samples=15)

In [38]:
print(f"2021 Average: {np.mean(bpr_over_time_last_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_last_time[0]) + \
                          np.array(bpr_over_time_last_time[1]))/2
                        
print(f"""Last Year model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.7274553540862435
Last Year model (Mean, 95% CI): 73.7,
      (71.6-
       76.2)


In [39]:
last_time_rmse_results, last_time_mae_results  = evaluation.calculate_metrics(actual_over_time_last_time, predicted_over_time_last_time, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=15 )

In [40]:
last_time_rmse_mean, last_time_rmse_conf_interval = last_time_rmse_results
last_time_mae_mean, last_time_mae_conf_interval = last_time_mae_results

evaluation.print_results("RMSE for Last Year Model", last_time_rmse_mean, last_time_rmse_conf_interval)
evaluation.print_results("MAE for Last Year Model", last_time_mae_mean, last_time_mae_conf_interval)


RMSE for Last Year Model (Mean, 95% CI): 2.31, (2.18-2.44)
MAE for Last Year Model (Mean, 95% CI): 1.51, (1.33-1.69)


### Historical Average 
#### lookback = 6 years for cook 

In [19]:
bpr_over_time_avg_time, predicted_over_time_avg_time = models.historical_average_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, 6, bpr_uncertainty_samples=50)

In [20]:
print(f"2021 Average: {np.mean(bpr_over_time_avg_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_avg_time[0]) + \
                          np.array(bpr_over_time_avg_time[1]))/2
                        
print(f"""Historical Average model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.8215910930729423
Historical Average model (Mean, 95% CI): 81.0,
      (79.3-
       82.9)


In [21]:
# Calculate the model predictions for the years 2019 and 2020
predicted_samples_2021 = predicted_over_time_avg_time[0]
predicted_samples_2022 = predicted_over_time_avg_time[1]

# Calculate RMSE for the Zeroes model for the year 2019 and 2020
rmse_2021 = np.sqrt(np.mean((predicted_samples_2021 - actual_values_2021)**2))
rmse_2022 = np.sqrt(np.mean((predicted_samples_2022 - actual_values_2022)**2))

joint_rmse = np.mean([rmse_2021, rmse_2022])

print(f"Historical Average model RMSE for 2019: {rmse_2021:.2f}")
print(f"Historical Average model RMSE for 2020: {rmse_2022:.2f}")
print(f"Joint RMSE for 2019 and 2020: {joint_rmse:.2f}")
print(" ")

mae_2021_samples = np.mean(np.abs(predicted_samples_2021 - actual_values_2022))
mae_2022_samples = np.mean(np.abs(predicted_samples_2022 - actual_values_2022))

# Take the average MAE for each year
mae_2021 = np.mean(mae_2021_samples)
mae_2022 = np.mean(mae_2022_samples)

# Calculate the joint MAE (mean of individual MAE values from both years)
joint_mae = np.mean([mae_2021, mae_2022])

print(f"Historical Average model MAE for 2021: {mae_2021:.2f}")
print(f"Historical Average model MAE for 2020: {mae_2022:.2f}")
print(f"Joint MAE for 2021 and 2022: {joint_mae:.2f}")

Historical Average model RMSE for 2019: 1.53
Historical Average model RMSE for 2020: 1.46
Joint RMSE for 2019 and 2020: 1.49
 
Historical Average model MAE for 2021: 1.00
Historical Average model MAE for 2020: 0.95
Joint MAE for 2021 and 2022: 0.97


### Weighted Historical Average
#### lookback = 6 years (for cook county)

In [41]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [42]:
bpr_over_time_weight_avg, actual_over_time_weight_avg, predicted_over_time_weight_avg = models.scikit_model(multiindexed_gdf, x_BSF_death_only,
                                               y_BS_death_only, x_test_BSF_death_only,
                                               linear_poisson_weighted_avg,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

[1.25225843 1.09200799 1.09440722 ... 0.83131589 0.83222862 1.88634815]
[1.64495378 1.43602538 1.88427933 ... 1.09320694 0.95226465 2.15842453]


In [None]:
print(f"2021 Average: {np.mean(bpr_over_time_weight_avg[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_weight_avg[0]) + \
                          np.array(bpr_over_time_weight_avg[1]))/2
                        
print(f"""Weighted Average model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


In [43]:
weight_avg_rmse_results, weight_avg_mae_results  = evaluation.calculate_metrics(actual_over_time_weight_avg, predicted_over_time_weight_avg, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=20)

In [44]:
weight_avg_rmse_mean, weight_avg_rmse_conf_interval = weight_avg_rmse_results
weight_avg_mae_mean, weight_avg_mae_conf_interval = weight_avg_mae_results

evaluation.print_results("RMSE for Weighted Average Model", weight_avg_rmse_mean, weight_avg_rmse_conf_interval)
evaluation.print_results("MAE for Weighted Average Model", weight_avg_mae_mean, weight_avg_mae_conf_interval)


RMSE for Weighted Average Model (Mean, 95% CI): 10.37, (-6.45-27.19)
MAE for Weighted Average Model (Mean, 95% CI): 9.57, (-7.29-26.43)


In [45]:
import tensorflow as tf
from pandas import IndexSlice as idx
from metrics import fast_bpr

# B = x_BSF_death_only of data, # S = locations, # F = features
B, S, F = x_BSF_death_only.shape

# reshape data into 2D for scikit learn models. 1 row = 1 location at 1 time
x_long = tf.reshape(x_BSF_death_only, ((B * S), F))
y_long = tf.reshape(y_BS_death_only, ((B * S), 1))

reg = linear_poisson_weighted_avg.fit(x_long, tf.squeeze(y_long))

# sloppy notation here, it's not the same B
num_test_times = x_test_BSF_death_only.shape[0]

rng = np.random.default_rng(seed=360)
num_sampled = S - 250
results_over_time = []
output_deaths = []

high_prediction_threshold = 200  # Adjust this threshold as needed

for timestep in range(first_test_timestep, last_test_timestep+1):
    evaluation_deaths = multiindexed_gdf.loc[idx[:, timestep], :]
    evaluation_deaths = evaluation_deaths.drop(columns=timestep_col).reset_index().set_index('geoid')[
        outcome_col]
    
    prediction = reg.predict(x_test_BSF_death_only[timestep - first_test_timestep])

    #trying to figure out where the super high deaths are? why is model getting 200+? 
    for location, predicted_value in zip(evaluation_deaths.index, prediction):
        if predicted_value > high_prediction_threshold:
            print(f"High prediction at timestep {timestep}, location {location}: {predicted_value}")

    #print(prediction)
    output_deaths.append(prediction)

    results_over_samples = []

    for _ in range(50):
        sampled_indicies = rng.choice(range(S), size=num_sampled, replace=False)

        results_over_samples.append(
            fast_bpr(evaluation_deaths[sampled_indicies],
                        pd.Series(prediction[sampled_indicies],
                                index=evaluation_deaths[sampled_indicies].index)
                        )
        )

    results_over_time.append(results_over_samples)

    #weighted historical average predicting 200+, bug somewhere?
    #show historical context for predictions


High prediction at timestep 7, location 17031231500: 336.0135685870546
High prediction at timestep 8, location 17031231500: 222.3305946809326


In [25]:
np.max(output_deaths[0])

336.0135685870546

In [26]:
np.max(output_deaths[1])

222.3305946809326

### Linear (Poisson GLM baseline)
#### lookback years = 3

In [46]:
bpr_over_time_linear, actual_over_time_linear, predicted_over_time_linear = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

[1.41099936 1.2945081  1.30874252 ... 0.95782909 0.92025344 2.20476922]
[1.86377385 1.71533257 2.2224617  ... 1.26920445 1.06358965 2.54817817]


In [47]:
print(f"2021 Average: {np.mean(bpr_over_time_linear[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_linear[0]) + \
                          np.array(bpr_over_time_linear[1]))/2
                        
print(f"""Poisson GLM model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.7658887879570023
Poisson GLM model (Mean, 95% CI): 76.1,
      (74.5-
       78.1)


In [48]:
linear_rmse_results, linear_mae_results  = evaluation.calculate_metrics(actual_over_time_linear, predicted_over_time_linear, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=20)

In [49]:
linear_rmse_mean, linear_rmse_conf_interval = linear_rmse_results
linear_mae_mean, linear_mae_conf_interval = linear_mae_results

evaluation.print_results("RMSE for Linear (Poisson GLM)", linear_rmse_mean, linear_rmse_conf_interval)
evaluation.print_results("MAE for Linear (Poisson GLM)", linear_mae_mean, linear_mae_conf_interval)


RMSE for Linear (Poisson GLM) (Mean, 95% CI): 9.44, (-5.51-24.40)
MAE for Linear (Poisson GLM) (Mean, 95% CI): 8.65, (-6.34-23.65)


### Gradient Boosted Trees (Poisson)
#### lookback years = 3 

In [50]:
bpr_over_time_tree, actual_over_time_tree, predicted_over_time_tree = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               hist_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

[1.63098092 1.31546526 1.12939356 ... 0.98639537 0.37331472 2.61947066]
[2.48069631 1.76387372 1.98798501 ... 1.18050964 0.48404729 2.61947066]


In [51]:
print(f"2021 Average: {np.mean(bpr_over_time_tree[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_tree[0]) + \
                          np.array(bpr_over_time_tree[1]))/2
                        
print(f"""Gradient Boosted Trees (Poisson) (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2021 Average: 0.7055868820194819
Gradient Boosted Trees (Poisson) (Mean, 95% CI): 68.1,
      (66.0-
       71.3)


In [52]:
tree_rmse_results, tree_mae_results  = evaluation.calculate_metrics(actual_over_time_tree, predicted_over_time_tree, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=20)

In [53]:
tree_rmse_mean, tree_rmse_conf_interval = tree_rmse_results
tree_mae_mean, tree_mae_conf_interval = tree_mae_results

evaluation.print_results("RMSE for Gradient Boosted Trees", tree_rmse_mean, tree_rmse_conf_interval)
evaluation.print_results("MAE for Gradient Boosted Trees", tree_mae_mean, tree_mae_conf_interval)


RMSE for Gradient Boosted Trees (Mean, 95% CI): 2.13, (2.07-2.18)
MAE for Gradient Boosted Trees (Mean, 95% CI): 1.30, (1.23-1.37)


### CASTNet
#### 3 lookback years

In [54]:
# Call the castnet_model function to calculate BPR for CASTNet predictions
bpr_results_castnet, actual_results_castnet, predicted_results_castnet  = models.castnet_model(multiindexed_gdf, first_test_timestep, last_test_timestep, 
                                            num_geoids, bpr_uncertainty_samples=50)


In [55]:
print(f"2021 Average: {np.mean(bpr_results_castnet[0])}")

bpr_samples_both_years = (np.array(bpr_results_castnet[0]) + \
                          np.array(bpr_results_castnet[1]))/2
                        
print(f"""CASTNet model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

2021 Average: 0.7739815165912338
CASTNet model (Mean, 95% CI): 75.2,
      (73.2-
       76.8)


In [56]:
castnet_rmse_results, castnet_mae_results  = evaluation.calculate_metrics(actual_results_castnet, predicted_results_castnet, 
                                          first_test_timestep, last_test_timestep, num_uncertainty_samples=20)

KeyError: 3

In [None]:
castnet_rmse_mean, castnet_rmse_conf_interval = castnet_rmse_results
castnet_mae_mean, castnet_mae_conf_interval = castnet_mae_results

evaluation.print_results("RMSE for CASTNet", castnet_rmse_mean, castnet_rmse_conf_interval)
evaluation.print_results("MAE for CASTNet", castnet_mae_mean, castnet_mae_conf_interval)
