In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import pickle 
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from shapely import wkt

# local import
from make_datasets import make_data
import models
import evaluation

2023-08-22 10:37:40.090371: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Retrieve the cleaned annual data frame.
# The base directory is resolved from the current user's home directory rather
# than a hard-coded '/Users/<name>/Desktop/' so the notebook runs on other machines.
user_dir = os.path.join(os.path.expanduser('~'), 'Desktop', '')
data_dir = os.path.join(user_dir, 'opioid-overdose-models/cook-county/data/')

gdf_annual = pd.read_csv(os.path.join(data_dir, 'cook_county_gdf_year.csv'))

# The CSV stores geometries as WKT text, so parse them and wrap the frame in a
# GeoDataFrame (reading the CSV directly as a GeoDataFrame was troublesome).
gdf_annual['geometry'] = gdf_annual['geometry'].apply(wkt.loads)

# Pass the CRS as an authority string; the {'init': 'EPSG:4269'} dict form is
# deprecated and triggers a warning when it is parsed.
gdf_annual = gpd.GeoDataFrame(gdf_annual, geometry='geometry', crs='EPSG:4269')

# Alias used by the rest of the notebook (same object, not a copy)
data_gdf = gdf_annual

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [16]:
# Store geoid as text. This assigns the column in place on gdf_annual, and
# data_gdf refers to the same object, so data_gdf sees the string geoid too.
gdf_annual['geoid'] = gdf_annual['geoid'].astype(str) #change to string

Process the dataframe into one with a MultiIndex on location and time

In [17]:
# Sanity check: (rows, columns) of the loaded annual frame
data_gdf.shape

(10624, 21)

In [18]:
# --- Column names -----------------------------------------------------------
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# Candidate columns for the X dataframe
x_idx_cols = [
    geography_col, 'lat', 'lon', timestep_col,
    'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
    'svi_pctile', 'year',
    'neighbor_t', 'deaths',
]

# Candidate columns for the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# --- Feature selection ------------------------------------------------------
# 'deaths' is always a feature; the two switches add the optional groups.
add_spacetime = True
add_svi = True
features_only = [
    'deaths',
    *(['lat', 'lon', timestep_col] if add_spacetime else []),
    *(['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']
      if add_svi else []),
]

# --- Train / test windows (jyontika's parameters) ---------------------------
validation_year = 2020
lookback_years = 2  # use 2 lookback years
first_train_eval_year = validation_year - lookback_years  # 2018
last_train_eval_year = validation_year - 1  # 2019
first_test_year, last_test_year = 2021, 2022
first_test_timestep, last_test_timestep = 7, 8

In [19]:
# Index the frame by (location, time)
index_levels = [geography_col, timestep_col]
multiindexed_gdf = data_gdf.set_index(index_levels)

# set_index moved the timestep out of the columns; copy it back in from the
# index because it is also used as a model feature
timestep_values = multiindexed_gdf.index.get_level_values(timestep_col)
multiindexed_gdf[timestep_col] = timestep_values

# Number of distinct locations (missing values, if any, count as one, as with unique())
num_geoids = data_gdf[geography_col].nunique(dropna=False)

In [20]:
def _build_split(first_year, last_year, feature_names):
    """Run make_data on the multi-indexed frame for one year range and feature list."""
    return make_data(multiindexed_gdf, first_year, last_year, lookback_years,
                     feature_names, num_geoids)


# Full feature set: training window, then test window
x_BSF, y_BS = _build_split(first_train_eval_year, last_train_eval_year, features_only)
x_test_BSF, y_test_BS = _build_split(first_test_year, last_test_year, features_only)

# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = _build_split(
    first_train_eval_year, last_train_eval_year, ['deaths'])
x_test_BSF_death_only, y_test_BS_death_only = _build_split(
    first_test_year, last_test_year, ['deaths'])


In [21]:
# Sanity check: shape of the training feature tensor returned by make_data
x_BSF.shape

TensorShape([2, 1328, 18])

In [22]:
# Sanity check: shape of the training target tensor returned by make_data
y_BS.shape

TensorShape([2, 1328])

In [23]:
# Observed deaths for each test timestep: one Series per timestep, indexed by
# the geography level of the MultiIndex.
# `.xs` selects a single timestep and drops that index level, so no
# drop/reset_index/set_index round-trip is needed, and unlike slicing with
# `.loc[idx[:, t], :]` it does not depend on the MultiIndex being lexsorted.
all_evaluation_deaths = [
    multiindexed_gdf.xs(timestep, level=timestep_col)[outcome_col]
    for timestep in range(first_test_timestep, last_test_timestep + 1)
]

# List of per-timestep Series, passed to evaluation.calculate_metrics later on
evaluation_deaths = all_evaluation_deaths

### All Zeroes Model
#### lookback = 2

In [24]:
# Baseline that is evaluated over the test timesteps; the first result is
# indexed per test year in the next cell, the second feeds calculate_metrics.
bpr_over_time_zeroes, predicted_over_time_zeroes = models.all_zeroes_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    bpr_uncertainty_samples=15,
)

In [25]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_zeroes[0])
second_year_bpr = np.array(bpr_over_time_zeroes[1])
print(f"2021 Average: {np.mean(bpr_over_time_zeroes[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Zeroes model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.21795137642562734
Zeroes model (Mean, 95% CI): 21.7,
      (21.2-
       22.3)


In [26]:
# Compare observed test-period deaths with the zeroes-model predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
zeroes_rmse_results, zeroes_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_zeroes,
)

In [27]:
# Split each metric result into its mean and confidence interval
zeroes_rmse_mean, zeroes_rmse_conf_interval = zeroes_rmse_results
zeroes_mae_mean, zeroes_mae_conf_interval = zeroes_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Zeroes Model", zeroes_rmse_mean, zeroes_rmse_conf_interval),
    ("MAE for Zeroes Model", zeroes_mae_mean, zeroes_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Zeroes Model (Mean, 95% CI): 2.40, (2.31-2.50)
MAE for Zeroes Model (Mean, 95% CI): 1.35, (1.31-1.39)


### Last Year
#### lookback = 1

In [28]:
# Last-year baseline over the test timesteps.
# NOTE(review): the positional 1 presumably is the lookback noted in the
# heading above; confirm against models.last_time_model.
bpr_over_time_last_time, predicted_over_time_last_time = models.last_time_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,
    bpr_uncertainty_samples=15,
)

In [29]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_last_time[0])
second_year_bpr = np.array(bpr_over_time_last_time[1])
print(f"2021 Average: {np.mean(bpr_over_time_last_time[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Last Year model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.7274443471590497
Last Year model (Mean, 95% CI): 73.7,
      (71.5-
       76.2)


In [30]:
# Compare observed test-period deaths with the last-year-model predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
last_time_rmse_results, last_time_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_last_time,
)

In [31]:
# Split each metric result into its mean and confidence interval
last_time_rmse_mean, last_time_rmse_conf_interval = last_time_rmse_results
last_time_mae_mean, last_time_mae_conf_interval = last_time_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Last Year Model", last_time_rmse_mean, last_time_rmse_conf_interval),
    ("MAE for Last Year Model", last_time_mae_mean, last_time_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Last Year Model (Mean, 95% CI): 1.64, (1.59-1.69)
MAE for Last Year Model (Mean, 95% CI): 1.06, (1.04-1.08)


### Historical Average 
#### lookback = 6 years for cook 

In [32]:
# Historical-average baseline over the test timesteps.
# NOTE(review): the positional 1 and 6 are not explained here; 6 presumably is
# the lookback from the heading above. Confirm against models.historical_average_model.
bpr_over_time_avg_time, predicted_over_time_avg_time = models.historical_average_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,
    6,
    bpr_uncertainty_samples=50,
)

In [33]:
# Sanity check: number of entries in the historical-average predictions
len(predicted_over_time_avg_time)

2

In [34]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_avg_time[0])
second_year_bpr = np.array(bpr_over_time_avg_time[1])
print(f"2021 Average: {np.mean(bpr_over_time_avg_time[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Historical Average model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.8216168434638973
Historical Average model (Mean, 95% CI): 81.0,
      (79.3-
       82.9)


In [35]:
# Compare observed test-period deaths with the historical-average predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
avg_time_rmse_results, avg_time_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_avg_time,
)

In [36]:
# Split each metric result into its mean and confidence interval
avg_time_rmse_mean, avg_time_rmse_conf_interval = avg_time_rmse_results
avg_time_mae_mean, avg_time_mae_conf_interval = avg_time_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Historical Average Model", avg_time_rmse_mean, avg_time_rmse_conf_interval),
    ("MAE for Historical Average Model", avg_time_mae_mean, avg_time_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Historical Average Model (Mean, 95% CI): 1.45, (1.41-1.48)
MAE for Historical Average Model (Mean, 95% CI): 0.95, (0.93-0.96)


### Weighted Historical Average
#### lookback = 6 years (for cook county) — NOTE: the inputs used below (x_BSF_death_only) were built with lookback_years = 2

In [37]:
# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson_params = dict(
    loss="poisson",
    max_iter=10000,
    max_depth=3,
    max_leaf_nodes=2,
    l2_regularization=1,
    min_samples_leaf=100,
)
hist_poisson = HistGradientBoostingRegressor(**hist_poisson_params)

# Two separate Poisson GLMs with default settings; they differ only in the
# features they are later fitted on (all features vs. deaths only)
linear_poisson = sklearn.linear_model.PoissonRegressor()
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()

In [38]:
# Weighted historical average: the Poisson GLM run on the deaths-only inputs
bpr_over_time_weight_avg, predicted_over_time_weight_avg = models.scikit_model(
    multiindexed_gdf,
    x_BSF_death_only,
    y_BS_death_only,
    x_test_BSF_death_only,
    linear_poisson_weighted_avg,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [39]:
# Largest single prediction from the weighted-average model.
# NOTE(review): the stored output (~336) looks like an extreme outlier next to
# the other models' errors; worth checking before trusting this model's RMSE.
np.max(predicted_over_time_weight_avg)

336.0135685870546

In [40]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_weight_avg[0])
second_year_bpr = np.array(bpr_over_time_weight_avg[1])
print(f"2021 Average: {np.mean(bpr_over_time_weight_avg[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Weighted Average model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.7689126387163289
Weighted Average model (Mean, 95% CI): 76.6,
      (74.7-
       78.2)


In [41]:
# Compare observed test-period deaths with the weighted-average predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
weight_avg_rmse_results, weight_avg_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_weight_avg,
)

In [42]:
# Split each metric result into its mean and confidence interval
weight_avg_rmse_mean, weight_avg_rmse_conf_interval = weight_avg_rmse_results
weight_avg_mae_mean, weight_avg_mae_conf_interval = weight_avg_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Weighted Average Model", weight_avg_rmse_mean, weight_avg_rmse_conf_interval),
    ("MAE for Weighted Average Model", weight_avg_mae_mean, weight_avg_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Weighted Average Model (Mean, 95% CI): 8.30, (6.83-9.77)
MAE for Weighted Average Model (Mean, 95% CI): 1.34, (1.34-1.34)


In [43]:
# df_weighted = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF_death_only,
#                                                y_BS_death_only, x_test_BSF_death_only,
#                                                linear_poisson_weighted_avg,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)


In [44]:
# df_weighted = df_weighted.sort_values(by='geoid_predictions', ascending=False)
# years = range(2015, 2023)
# for year in years:
#     df_weighted[f'deaths in {year}'] = None  # Initialize with None

# # Fill in the columns with values from the historical_deaths list
# for index, row in df_weighted.iterrows():
#     for year, deaths in zip(years, row['historical_deaths']):
#         df_weighted.at[index, f'deaths in {year}'] = deaths


# df_weighted = df_weighted.drop(columns=['historical_deaths'])

# # Create columns for each prediction year
# prediction_years = [2021, 2022]
# for year in prediction_years:
#     df_weighted[f'prediction for {year}'] = None  # Initialize with None

# for index, row in df_weighted.iterrows():
#     for year, prediction in zip(prediction_years, row['geoid_predictions']):
#         df_weighted.at[index, f'prediction for {year}'] = prediction

# df_weighted = df_weighted.drop(columns=['geoid_predictions'])


In [45]:
# #df_weighted
# excel_filename = 'weighted_average_df_chicago.xlsx'  # Provide the desired filename
# df_weighted.to_excel(excel_filename, index=False)  # Specify index=False to exclude row indices from the output


### Linear (Poisson GLM baseline)
#### lookback years = 2 (x_BSF is built with lookback_years = 2)

In [46]:
# Poisson GLM run on the full feature set
bpr_over_time_linear, predicted_over_time_linear = models.scikit_model(
    multiindexed_gdf,
    x_BSF,
    y_BS,
    x_test_BSF,
    linear_poisson,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [47]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_linear[0])
second_year_bpr = np.array(bpr_over_time_linear[1])
print(f"2021 Average: {np.mean(bpr_over_time_linear[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Poisson GLM model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.7658887879570023
Poisson GLM model (Mean, 95% CI): 76.1,
      (74.5-
       78.1)


In [48]:
# Compare observed test-period deaths with the Poisson GLM predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
linear_rmse_results, linear_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_linear,
)

In [49]:
# Split each metric result into its mean and confidence interval
linear_rmse_mean, linear_rmse_conf_interval = linear_rmse_results
linear_mae_mean, linear_mae_conf_interval = linear_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Linear (Poisson GLM)", linear_rmse_mean, linear_rmse_conf_interval),
    ("MAE for Linear (Poisson GLM)", linear_mae_mean, linear_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Linear (Poisson GLM) (Mean, 95% CI): 7.50, (6.32-8.67)
MAE for Linear (Poisson GLM) (Mean, 95% CI): 1.29, (1.28-1.31)


In [50]:
# df_linear = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF,
#                                                y_BS, x_test_BSF,
#                                                linear_poisson,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)

In [51]:
# #df_linear
# excel_filename = 'linear__df_cook.xlsx'  # Provide the desired filename
# df_linear.to_excel(excel_filename, index=False)  # Specify index=False to exclude row indices from the output


### Gradient Boosted Trees (Poisson)
#### lookback years = 2 (x_BSF is built with lookback_years = 2)

In [52]:
# Gradient boosted trees (Poisson loss) run on the full feature set
bpr_over_time_tree, predicted_over_time_tree = models.scikit_model(
    multiindexed_gdf,
    x_BSF,
    y_BS,
    x_test_BSF,
    hist_poisson,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [53]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_over_time_tree[0])
second_year_bpr = np.array(bpr_over_time_tree[1])
print(f"2021 Average: {np.mean(bpr_over_time_tree[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""Gradient Boosted Trees (Poisson) (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")


2021 Average: 0.7055868820194819
Gradient Boosted Trees (Poisson) (Mean, 95% CI): 68.1,
      (66.0-
       71.3)


In [54]:
# Compare observed test-period deaths with the gradient-boosted-tree predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
tree_rmse_results, tree_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_over_time_tree,
)

In [55]:
# Split each metric result into its mean and confidence interval
tree_rmse_mean, tree_rmse_conf_interval = tree_rmse_results
tree_mae_mean, tree_mae_conf_interval = tree_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for Gradient Boosted Trees", tree_rmse_mean, tree_rmse_conf_interval),
    ("MAE for Gradient Boosted Trees", tree_mae_mean, tree_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for Gradient Boosted Trees (Mean, 95% CI): 1.68, (1.59-1.78)
MAE for Gradient Boosted Trees (Mean, 95% CI): 1.05, (1.01-1.09)


### CASTNet
#### 3 lookback years

In [56]:
# Run models.castnet_model to obtain BPR samples and predictions for CASTNet.
# NOTE(review): the meaning of the positional True is not visible here; confirm
# against models.castnet_model.
bpr_results_castnet, predicted_results_castnet = models.castnet_model(
    multiindexed_gdf,
    True,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    bpr_uncertainty_samples=50,
)


In [57]:
# BPR samples for the two test years (index 0 is reported as 2021)
first_year_bpr = np.array(bpr_results_castnet[0])
second_year_bpr = np.array(bpr_results_castnet[1])
print(f"2021 Average: {np.mean(bpr_results_castnet[0])}")

# Element-wise average of the two years' samples
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

# Mean and 2.5th/97.5th percentiles, expressed as percentages
mean_pct = np.mean(bpr_samples_both_years) * 100
ci_low_pct, ci_high_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100
print(f"""CASTNet model (Mean, 95% CI): {mean_pct:.1f},
      ({ci_low_pct:.1f}-
       {ci_high_pct:.1f})""")

2021 Average: 0.7739815165912338
CASTNet model (Mean, 95% CI): 75.2,
      (73.2-
       76.8)


In [58]:
# Compare observed test-period deaths with the CASTNet predictions;
# each result is unpacked into (mean, confidence interval) in the next cell.
castnet_rmse_results, castnet_mae_results = evaluation.calculate_metrics(
    evaluation_deaths,
    predicted_results_castnet,
)

In [59]:
# Split each metric result into its mean and confidence interval
castnet_rmse_mean, castnet_rmse_conf_interval = castnet_rmse_results
castnet_mae_mean, castnet_mae_conf_interval = castnet_mae_results

# Report both metrics through the shared printer
for metric_label, metric_mean, metric_ci in (
    ("RMSE for CASTNet", castnet_rmse_mean, castnet_rmse_conf_interval),
    ("MAE for CASTNet", castnet_mae_mean, castnet_mae_conf_interval),
):
    evaluation.print_results(metric_label, metric_mean, metric_ci)


RMSE for CASTNet (Mean, 95% CI): 1.53, (1.39-1.67)
MAE for CASTNet (Mean, 95% CI): 1.01, (0.98-1.04)
