In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import os

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessRegressor


# local import
from make_datasets import make_data
import models
from metrics import fast_bpr
import evaluation

2023-10-07 11:48:56.175156: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-07 11:48:56.177196: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-07 11:48:56.219610: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-07 11:48:56.220283: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/results_202308_pipeline/'
data_path= os.path.join( data_dir, './clean_annual_tract/')
data_gdf = gpd.read_file(data_path)

Process dataframe into a data frame with a Multiindex on location and time

In [3]:

# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = ['deaths']
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']


validation_year = 2019
first_test_year = 2020
last_test_year = 2021
first_test_timestep = 20
last_test_timestep = 21
lookback_years= 5
first_train_eval_year = validation_year - lookback_years
last_train_eval_year = validation_year -1

In [4]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

# re-add the timestep column as a feature because it's useful
multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [5]:
x_BSF, y_BS = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          features_only, num_geoids)
x_test_BSF, y_test_BS =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          features_only, num_geoids)

# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths'], num_geoids)

2023-10-07 11:49:20.844325: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-10-07 11:49:20.844363: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: s1cmp008.pax.tufts.edu
2023-10-07 11:49:20.844372: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: s1cmp008.pax.tufts.edu
2023-10-07 11:49:20.844469: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.104.5
2023-10-07 11:49:20.844498: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.104.5
2023-10-07 11:49:20.844503: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.104.5


In [6]:
x_BSF.shape

TensorShape([5, 1620, 45])

In [7]:
y_BS.shape

TensorShape([5, 1620])

In [8]:
all_evaluation_deaths = []

for timestep in range(first_test_timestep, last_test_timestep+1):
    evaluation_deaths = multiindexed_gdf.loc[idx[:, timestep], :]
    evaluation_deaths = evaluation_deaths.drop(columns=timestep_col).reset_index().set_index('geoid')['deaths']
    
    # Append the current evaluation_deaths data to the list
    all_evaluation_deaths.append(evaluation_deaths)

evaluation_deaths = all_evaluation_deaths

### All Zeroes Model
#### lookback years =2

In [32]:
bpr_over_time_zeroes, predicted_over_time_zeroes, denominator_deaths = models.all_zeroes_model(multiindexed_gdf,
                                        first_test_timestep, last_test_timestep,
                                        num_geoids, bpr_uncertainty_samples=15)

In [25]:
print(f"2019 Average: {np.mean(bpr_over_time_zeroes[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_zeroes[0]) + \
                          np.array(bpr_over_time_zeroes[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.2498676865758481
Zeroes model (Mean, 95% CI): 25.1,
      (24.9-
       25.5)


In [26]:
zeroes_rmse_results, zeroes_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_zeroes)

In [27]:
zeroes_rmse_mean, zeroes_rmse_conf_interval = zeroes_rmse_results
zeroes_mae_mean, zeroes_mae_conf_interval = zeroes_mae_results

evaluation.print_results("RMSE for Zeroes Model", zeroes_rmse_mean, zeroes_rmse_conf_interval)
evaluation.print_results("MAE for Zeroes Model", zeroes_mae_mean, zeroes_mae_conf_interval)


RMSE for Zeroes Model (Mean, 95% CI): 1.91, (1.83-2.01)
MAE for Zeroes Model (Mean, 95% CI): 1.24, (1.18-1.30)


In [34]:
print(f"""Zeroes model overdose reach (Mean, 95% CI): {np.mean(bpr_samples_both_years)*np.mean(denominator_deaths):.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*np.mean(denominator_deaths):.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*np.mean(denominator_deaths):.1f})""")


Zeroes model overdose reach (Mean, 95% CI): 123.7,
      (122.8-
       125.8)


In [35]:
np.mean(denominator_deaths)

492.6333333333333

In [12]:
zeroes_rmse_mean, zeroes_rmse_conf_interval = zeroes_rmse_results
zeroes_mae_mean, zeroes_mae_conf_interval = zeroes_mae_results

evaluation.print_results("RMSE for Zeroes Model", zeroes_rmse_mean, zeroes_rmse_conf_interval)
evaluation.print_results("MAE for Zeroes Model", zeroes_mae_mean, zeroes_mae_conf_interval)


RMSE for Zeroes Model (Mean, 95% CI): 1.91, (1.84-1.98)
MAE for Zeroes Model (Mean, 95% CI): 1.24, (1.19-1.28)


### Last Year
#### lookback = 1 year

In [None]:
bpr_over_time_last_time, predicted_over_time_last_time  = models.last_time_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, bpr_uncertainty_samples=50)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_last_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_last_time[0]) + \
                          np.array(bpr_over_time_last_time[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.5583644190051337
Zeroes model (Mean, 95% CI): 53.9,
      (52.4-
       55.3)


In [None]:
len(predicted_over_time_last_time)

2

In [None]:
last_time_rmse_results, last_time_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_last_time)

In [None]:
last_time_rmse_mean, last_time_rmse_conf_interval = last_time_rmse_results
last_time_mae_mean, last_time_mae_conf_interval = last_time_mae_results

evaluation.print_results("RMSE for Last Year Model", last_time_rmse_mean, last_time_rmse_conf_interval)
evaluation.print_results("MAE for Last Year Model", last_time_mae_mean, last_time_mae_conf_interval)


RMSE for Last Year Model (Mean, 95% CI): 1.51, (1.48-1.54)
MAE for Last Year Model (Mean, 95% CI): 1.05, (1.03-1.08)


### Historical Average 
#### lookback = 1 years

In [47]:
bpr_over_time_avg_time, predicted_over_time_avg_time = models.historical_average_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, 4, bpr_uncertainty_samples=100, removed_locations=250)

In [48]:
print(f"2019 Average: {np.mean(bpr_over_time_avg_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_avg_time[0]) + \
                          np.array(bpr_over_time_avg_time[1]))/2
                        
print(f"""Hist. Avg  model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6005169669613087
Hist. Avg  model (Mean, 95% CI): 59.8,
      (57.9-
       62.1)


In [49]:
avg_time_rmse_results, avg_time_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_avg_time)

In [50]:
avg_time_rmse_mean, avg_time_rmse_conf_interval = avg_time_rmse_results
avg_time_mae_mean, avg_time_mae_conf_interval = avg_time_mae_results

evaluation.print_results("RMSE for Historical Average Model", avg_time_rmse_mean, avg_time_rmse_conf_interval)
evaluation.print_results("MAE for Historical Average Model", avg_time_mae_mean, avg_time_mae_conf_interval)


RMSE for Historical Average Model (Mean, 95% CI): 1.28, (1.24-1.33)
MAE for Historical Average Model (Mean, 95% CI): 0.92, (0.89-0.93)


### Weighted Historical Average
#### lookback = 7 years

Make Scikit models

In [24]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )


kernel = RBF(length_scale = x_BSF.shape[-1]*[0.5])
gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

In [None]:
bpr_over_time_weight_avg, predicted_over_time_weight_avg = models.scikit_model(multiindexed_gdf, x_BSF_death_only,
                                               y_BS_death_only, x_test_BSF_death_only,
                                               linear_poisson_weighted_avg,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_weight_avg[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_weight_avg[0]) + \
                          np.array(bpr_over_time_weight_avg[1]))/2
                        
print(f"""Weighted Hist. Avg  model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6264956051073434
Weighted Hist. Avg  model (Mean, 95% CI): 61.1,
      (59.2-
       63.4)


In [None]:
weight_avg_rmse_results, weight_avg_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_weight_avg)

In [None]:
weight_avg_rmse_mean, weight_avg_rmse_conf_interval = weight_avg_rmse_results
weight_avg_mae_mean, weight_avg_mae_conf_interval = weight_avg_mae_results

evaluation.print_results("RMSE for Weighted Average Model", weight_avg_rmse_mean, weight_avg_rmse_conf_interval)
evaluation.print_results("MAE for Weighted Average Model", weight_avg_mae_mean, weight_avg_mae_conf_interval)


RMSE for Weighted Average Model (Mean, 95% CI): 1.22, (1.18-1.26)
MAE for Weighted Average Model (Mean, 95% CI): 0.93, (0.90-0.96)


In [None]:
# df_weighted = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF_death_only,
#                                                y_BS_death_only, x_test_BSF_death_only,
#                                                linear_poisson_weighted_avg,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)


In [None]:
# #df_weighted
# excel_filename = 'weighted_average_df_MA.xlsx'  # Provide the desired filename
# df_weighted.to_excel(excel_filename, index=False)  # Specify index=False to exclude row indices from the output


### Linear (Poisson GLM baseline)

In [16]:
bpr_over_time_linear, predicted_over_time_linear = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [17]:
print(f"2019 Average: {np.mean(bpr_over_time_linear[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_linear[0]) + \
                          np.array(bpr_over_time_linear[1]))/2
                        
print(f"""Linear (Poisson GLM) model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6091544102988147
Linear (Poisson GLM) model (Mean, 95% CI): 62.7,
      (60.8-
       64.8)


In [21]:
linear_rmse_results, linear_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_linear)

In [22]:
linear_rmse_mean, linear_rmse_conf_interval = linear_rmse_results
linear_mae_mean, linear_mae_conf_interval = linear_mae_results

evaluation.print_results("RMSE for Linear (Poisson GLM)", linear_rmse_mean, linear_rmse_conf_interval)
evaluation.print_results("MAE for Linear (Poisson GLM)", linear_mae_mean, linear_mae_conf_interval)


RMSE for Linear (Poisson GLM) (Mean, 95% CI): 1.28, (1.23-1.32)
MAE for Linear (Poisson GLM) (Mean, 95% CI): 0.98, (0.95-1.00)


In [None]:
# df_linear = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF,
#                                                y_BS, x_test_BSF,
#                                                linear_poisson,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)

### Gradient Boosted Trees (Poisson)

In [None]:
bpr_over_time_tree, predicted_over_time_tree = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               hist_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_tree[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_tree[0]) + \
                          np.array(bpr_over_time_tree[1]))/2
                        
print(f"""Gradient Boosted Trees (Poisson)  (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.5583002425125425
Gradient Boosted Trees (Poisson)  (Mean, 95% CI): 57.6,
      (56.3-
       59.3)


In [None]:
tree_rmse_results, tree_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_tree)

In [None]:
tree_rmse_mean, tree_rmse_conf_interval = tree_rmse_results
tree_mae_mean, tree_mae_conf_interval = tree_mae_results

evaluation.print_results("RMSE for Gradient Boosted Trees", tree_rmse_mean, tree_rmse_conf_interval)
evaluation.print_results("MAE for Gradient Boosted Trees", tree_mae_mean, tree_mae_conf_interval)


RMSE for Gradient Boosted Trees (Mean, 95% CI): 1.20, (1.17-1.24)
MAE for Gradient Boosted Trees (Mean, 95% CI): 0.90, (0.88-0.91)


## GPR

In [25]:
bpr_over_time_gp, predicted_over_time_gp = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               gaussian_process,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

print(f"2019 Average: {np.mean(bpr_over_time_gp[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_gp[0]) + \
                          np.array(bpr_over_time_gp[1]))/2
                        
print(f"""GP  (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

gp_rmse_results, gp_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_gp)

gp_rmse_mean, gp_rmse_conf_interval = gp_rmse_results
gp_mae_mean, gp_mae_conf_interval = gp_mae_results

evaluation.print_results("RMSE for GP", gp_rmse_mean, gp_rmse_conf_interval)
evaluation.print_results("MAE for GP", gp_mae_mean, gp_mae_conf_interval)




2019 Average: 0.17648847268614937
GP  (Mean, 95% CI): 16.3,
      (15.2-
       17.2)
RMSE for GP (Mean, 95% CI): 1.90, (1.83-1.98)
MAE for GP (Mean, 95% CI): 1.24, (1.20-1.28)


In [27]:
gaussian_process.kernel_

RBF(length_scale=[1e-05, 1e-05, 1e-05, 1e+05, 1e+05, 1e+05, 1e+05, 1e+05, 49.8, 0.00112, 1e-05, 1e-05, 1e+05, 1e+05, 1e+05, 1e+05, 1e+05, 1e+05, 2.33, 1e-05, 1e-05, 1e+05, 684, 0.365, 0.278, 0.0154, 79.6, 1.49, 1e-05, 1e-05, 1e+05, 3.88e+04, 0.139, 553, 1e+05, 0.0236, 1e-05, 1e-05, 1e-05, 1e+05, 19.6, 40.1, 0.189, 14.9, 6.5e+03])

### CASTNet

In [71]:
cn_result_path = '/cluster/home/kheuto01/code/opioid-overdose-models/CASTNet/hughes-CASTNet/Results/MA-predictions.csv'
cn_location_path = '/cluster/home/kheuto01/code/opioid-overdose-models/CASTNet/hughes-CASTNet/Data/MA/locations.txt'

CN_results = pd.read_csv(cn_result_path)
CN_results['geoid'] = CN_results['geoid'].astype(str)

CN_locations = []
with open(cn_location_path, 'rb') as file:
    for line in file:
        line = line.rstrip().decode("utf-8").split("\t")
        CN_locations.append(line[1])

In [53]:
CN_results

Unnamed: 0,geoid,prediction,year
0,25001010100,0.700970,2020
1,25001010206,1.111377,2020
2,25001010208,0.395809,2020
3,25001010304,0.592120,2020
4,25001010306,0.000000,2020
...,...,...,...
3235,25027761100,0.202868,2021
3236,25027761200,1.173514,2021
3237,25027761300,1.134573,2021
3238,25027761401,0.220063,2021


In [16]:
seed=360
removed_locations=250
first_pred_time = first_test_timestep
last_pred_time = last_test_timestep
location_col='geoid'
dataset_name=False
bpr_uncertainty_samples=50

CN_results = pd.read_csv(cn_result_path)
CN_results['geoid'] = CN_results['geoid'].astype(str)

CN_locations = []
with open(cn_location_path, 'rb') as file:
    for line in file:
        line = line.rstrip().decode("utf-8").split("\t")
        CN_locations.append(line[1])

# sample and calculate BPR
rng = np.random.default_rng(seed=seed)
num_locations = len(CN_locations)
num_sampled = num_locations - removed_locations
results_over_time = []

for timestep in range(first_pred_time, last_pred_time + 1):
    # extract evaluation deaths 
    evaluation_deaths = multiindexed_gdf.loc[idx[:, timestep], :]
    evaluation_deaths = evaluation_deaths.drop(columns=timestep_col).reset_index().set_index(location_col)[outcome_col]

    current_year = 2014 + timestep if dataset_name=='cook-county' else 2000 + timestep 
    predicted_deaths_df = CN_results[(CN_results['year'] == current_year)].set_index('geoid')
    #predicted_deaths = predicted_deaths_df['prediction'].values

    results_over_samples = []
    for _ in range(bpr_uncertainty_samples):
        sampled_indices = rng.choice(range(num_locations), size=num_sampled, replace=False)
        evaluation_deaths_series = evaluation_deaths.iloc[sampled_indices]
        predicted_deaths_sampled = predicted_deaths_df.iloc[sampled_indices]['prediction']
        results_over_samples.append(fast_bpr(evaluation_deaths_series, predicted_deaths_sampled))

    results_over_time.append(results_over_samples)

In [51]:
predicted_deaths_sampled

geoid
25021420201    2.311409
25021980000    1.346856
25017350600    0.196418
25021408104    3.325365
25017310300    1.173893
                 ...   
25027730401    0.295701
25023561100    1.130404
25013800800    1.230816
25013800900    1.195563
25021420301    2.280682
Name: prediction, Length: 1370, dtype: float64

In [72]:
# Call the castnet_model function to calculate BPR for CASTNet predictions
bpr_results_castnet, predicted_results_castnet = models.castnet_model(multiindexed_gdf, False,
                                                                      cn_result_path,cn_location_path,
                                                                      first_test_timestep, last_test_timestep, 
                                            removed_locations=250, bpr_uncertainty_samples=50)


In [76]:
print(f"2019 Average: {np.mean(bpr_results_castnet[1])}")

bpr_samples_both_years = (np.array(bpr_results_castnet[0]) + \
                          np.array(bpr_results_castnet[1]))/2
                        
print(f"""CASTNet model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

2019 Average: 0.5470706904043792
CASTNet model (Mean, 95% CI): 54.4,
      (52.0-
       56.6)


In [74]:
castnet_rmse_results, castnet_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_results_castnet)

In [70]:
castnet_rmse_mean, castnet_rmse_conf_interval = castnet_rmse_results
castnet_mae_mean, castnet_mae_conf_interval = castnet_mae_results

evaluation.print_results("RMSE for CASTNet", castnet_rmse_mean, castnet_rmse_conf_interval)
evaluation.print_results("MAE for CASTNet", castnet_mae_mean, castnet_mae_conf_interval)


RMSE for CASTNet (Mean, 95% CI): 1.47, (1.36-1.58)
MAE for CASTNet (Mean, 95% CI): 1.07, (0.99-1.16)


In [59]:
predicted_results_castnet.shape

(2, 50)

In [17]:
evaluation_deaths

geoid
25001010100    1.0
25001010206    0.0
25001010208    0.0
25001010304    0.0
25001010306    2.0
              ... 
25027761100    1.0
25027761200    0.0
25027761300    0.0
25027761401    1.0
25027761402    1.0
Name: deaths, Length: 1620, dtype: float64

In [20]:
evaluation_deaths_series

geoid
25021420201    1.0
25021980000    0.0
25017350600    1.0
25021408104    0.0
25017310300    3.0
              ... 
25027730401    4.0
25023561100    1.0
25013800800    3.0
25013800900    3.0
25021420301    2.0
Name: deaths, Length: 1370, dtype: float64

In [21]:
predicted_deaths_sampled

geoid
25021420201    2.311409
25021980000    1.346856
25017350600    0.196418
25021408104    3.325365
25017310300    1.173893
                 ...   
25027730401    0.295701
25023561100    1.130404
25013800800    1.230816
25013800900    1.195563
25021420301    2.280682
Name: prediction, Length: 1370, dtype: float64

In [27]:
fast_bpr.__module__

'metrics'

In [38]:
fast_bpr(bpr_results_castnet, predicted_results_castnet)

TypeError: sort_values() missing 1 required positional argument: 'by'

In [44]:
sampled_indices

array([], dtype=int64)

In [48]:
num_sampled

0

In [63]:
predicted_deaths[1]

NameError: name 'predicted_deaths' is not defined

In [65]:
len(predicted_results_castnet)

2