In [3]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import os

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models
import evaluation

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
user_dir = '/Users/jyontika/Desktop/'
data_dir = 'jyontika-MA-data/data'
data_path= os.path.join(user_dir, data_dir, './clean_annual_tract/')
data_gdf = gpd.read_file(data_path)

Process dataframe into a data frame with a Multiindex on location and time

In [None]:

# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']

# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = ['deaths']
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']


validation_year = 2018
first_test_year = 2019
last_test_year = 2020
first_test_timestep = 19
last_test_timestep = 20
lookback_years= 5
first_train_eval_year = validation_year - lookback_years
last_train_eval_year = validation_year -1

In [None]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

# re-add the timestep column as a feature because it's useful
multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [None]:
x_BSF, y_BS = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          features_only, num_geoids)
x_test_BSF, y_test_BS =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          features_only, num_geoids)

# For the weighted historical average model, we only use deaths as features
x_BSF_death_only, y_BS_death_only = make_data(multiindexed_gdf, first_train_eval_year, last_train_eval_year, lookback_years,
          ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only =make_data(multiindexed_gdf, first_test_year, last_test_year, lookback_years,
          ['deaths'], num_geoids)

In [None]:
x_BSF.shape

TensorShape([5, 1620, 45])

In [None]:
y_BS.shape

TensorShape([5, 1620])

In [None]:
all_evaluation_deaths = []

for timestep in range(first_test_timestep, last_test_timestep+1):
    evaluation_deaths = multiindexed_gdf.loc[idx[:, timestep], :]
    evaluation_deaths = evaluation_deaths.drop(columns=timestep_col).reset_index().set_index('geoid')['deaths']
    
    # Append the current evaluation_deaths data to the list
    all_evaluation_deaths.append(evaluation_deaths)

evaluation_deaths = all_evaluation_deaths

### All Zeroes Model
#### lookback years =2

In [None]:
bpr_over_time_zeroes, predicted_over_time_zeroes = models.all_zeroes_model(multiindexed_gdf,
                                        first_test_timestep, last_test_timestep,
                                        num_geoids, bpr_uncertainty_samples=15)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_zeroes[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_zeroes[0]) + \
                          np.array(bpr_over_time_zeroes[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.25896326804183245
Zeroes model (Mean, 95% CI): 25.4,
      (25.1-
       25.8)


In [None]:
zeroes_rmse_results, zeroes_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_zeroes)

In [None]:
zeroes_rmse_mean, zeroes_rmse_conf_interval = zeroes_rmse_results
zeroes_mae_mean, zeroes_mae_conf_interval = zeroes_mae_results

evaluation.print_results("RMSE for Zeroes Model", zeroes_rmse_mean, zeroes_rmse_conf_interval)
evaluation.print_results("MAE for Zeroes Model", zeroes_mae_mean, zeroes_mae_conf_interval)


RMSE for Zeroes Model (Mean, 95% CI): 1.80, (1.73-1.88)
MAE for Zeroes Model (Mean, 95% CI): 1.17, (1.12-1.22)


### Last Year
#### lookback = 1 year

In [None]:
bpr_over_time_last_time, predicted_over_time_last_time  = models.last_time_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, bpr_uncertainty_samples=50)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_last_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_last_time[0]) + \
                          np.array(bpr_over_time_last_time[1]))/2
                        
print(f"""Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.5583644190051337
Zeroes model (Mean, 95% CI): 53.9,
      (52.4-
       55.3)


In [None]:
len(predicted_over_time_last_time)

2

In [None]:
last_time_rmse_results, last_time_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_last_time)

In [None]:
last_time_rmse_mean, last_time_rmse_conf_interval = last_time_rmse_results
last_time_mae_mean, last_time_mae_conf_interval = last_time_mae_results

evaluation.print_results("RMSE for Last Year Model", last_time_rmse_mean, last_time_rmse_conf_interval)
evaluation.print_results("MAE for Last Year Model", last_time_mae_mean, last_time_mae_conf_interval)


RMSE for Last Year Model (Mean, 95% CI): 1.51, (1.48-1.54)
MAE for Last Year Model (Mean, 95% CI): 1.05, (1.03-1.08)


### Historical Average 
#### lookback = 1 years

In [None]:
bpr_over_time_avg_time, predicted_over_time_avg_time = models.historical_average_model(multiindexed_gdf, first_test_timestep, last_test_timestep, num_geoids,
                     1, 7, bpr_uncertainty_samples=10)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_avg_time[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_avg_time[0]) + \
                          np.array(bpr_over_time_avg_time[1]))/2
                        
print(f"""Hist. Avg  model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6308297301973017
Hist. Avg  model (Mean, 95% CI): 61.0,
      (59.9-
       62.5)


In [None]:
avg_time_rmse_results, avg_time_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_avg_time)

In [None]:
avg_time_rmse_mean, avg_time_rmse_conf_interval = avg_time_rmse_results
avg_time_mae_mean, avg_time_mae_conf_interval = avg_time_mae_results

evaluation.print_results("RMSE for Historical Average Model", avg_time_rmse_mean, avg_time_rmse_conf_interval)
evaluation.print_results("MAE for Historical Average Model", avg_time_mae_mean, avg_time_mae_conf_interval)


RMSE for Historical Average Model (Mean, 95% CI): 1.20, (1.16-1.25)
MAE for Historical Average Model (Mean, 95% CI): 0.87, (0.84-0.89)


### Weighted Historical Average
#### lookback = 7 years

Make Scikit models

In [None]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [None]:
bpr_over_time_weight_avg, predicted_over_time_weight_avg = models.scikit_model(multiindexed_gdf, x_BSF_death_only,
                                               y_BS_death_only, x_test_BSF_death_only,
                                               linear_poisson_weighted_avg,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_weight_avg[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_weight_avg[0]) + \
                          np.array(bpr_over_time_weight_avg[1]))/2
                        
print(f"""Weighted Hist. Avg  model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6264956051073434
Weighted Hist. Avg  model (Mean, 95% CI): 61.1,
      (59.2-
       63.4)


In [None]:
weight_avg_rmse_results, weight_avg_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_weight_avg)

In [None]:
weight_avg_rmse_mean, weight_avg_rmse_conf_interval = weight_avg_rmse_results
weight_avg_mae_mean, weight_avg_mae_conf_interval = weight_avg_mae_results

evaluation.print_results("RMSE for Weighted Average Model", weight_avg_rmse_mean, weight_avg_rmse_conf_interval)
evaluation.print_results("MAE for Weighted Average Model", weight_avg_mae_mean, weight_avg_mae_conf_interval)


RMSE for Weighted Average Model (Mean, 95% CI): 1.22, (1.18-1.26)
MAE for Weighted Average Model (Mean, 95% CI): 0.93, (0.90-0.96)


In [None]:
# df_weighted = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF_death_only,
#                                                y_BS_death_only, x_test_BSF_death_only,
#                                                linear_poisson_weighted_avg,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)


In [None]:
# #df_weighted
# excel_filename = 'weighted_average_df_MA.xlsx'  # Provide the desired filename
# df_weighted.to_excel(excel_filename, index=False)  # Specify index=False to exclude row indices from the output


### Linear (Poisson GLM baseline)

In [None]:
bpr_over_time_linear, predicted_over_time_linear = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               linear_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_linear[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_linear[0]) + \
                          np.array(bpr_over_time_linear[1]))/2
                        
print(f"""Linear (Poisson GLM) model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.6320362020652359
Linear (Poisson GLM) model (Mean, 95% CI): 61.5,
      (59.8-
       63.6)


In [None]:
linear_rmse_results, linear_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_linear)

In [None]:
linear_rmse_mean, linear_rmse_conf_interval = linear_rmse_results
linear_mae_mean, linear_mae_conf_interval = linear_mae_results

evaluation.print_results("RMSE for Linear (Poisson GLM)", linear_rmse_mean, linear_rmse_conf_interval)
evaluation.print_results("MAE for Linear (Poisson GLM)", linear_mae_mean, linear_mae_conf_interval)


RMSE for Linear (Poisson GLM) (Mean, 95% CI): 1.32, (1.25-1.39)
MAE for Linear (Poisson GLM) (Mean, 95% CI): 1.05, (0.99-1.11)


In [None]:
# df_linear = models.scikit_model_with_coefficients(multiindexed_gdf, x_BSF,
#                                                y_BS, x_test_BSF,
#                                                linear_poisson,
#                                                first_test_timestep, last_test_timestep,
#                                                bpr_uncertainty_samples=20)

### Gradient Boosted Trees (Poisson)

In [None]:
bpr_over_time_tree, predicted_over_time_tree = models.scikit_model(multiindexed_gdf, x_BSF,
                                               y_BS, x_test_BSF,
                                               hist_poisson,
                                               first_test_timestep, last_test_timestep,
                                               bpr_uncertainty_samples=20)

In [None]:
print(f"2019 Average: {np.mean(bpr_over_time_tree[0])}")

bpr_samples_both_years = (np.array(bpr_over_time_tree[0]) + \
                          np.array(bpr_over_time_tree[1]))/2
                        
print(f"""Gradient Boosted Trees (Poisson)  (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")


2019 Average: 0.5583002425125425
Gradient Boosted Trees (Poisson)  (Mean, 95% CI): 57.6,
      (56.3-
       59.3)


In [None]:
tree_rmse_results, tree_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_over_time_tree)

In [None]:
tree_rmse_mean, tree_rmse_conf_interval = tree_rmse_results
tree_mae_mean, tree_mae_conf_interval = tree_mae_results

evaluation.print_results("RMSE for Gradient Boosted Trees", tree_rmse_mean, tree_rmse_conf_interval)
evaluation.print_results("MAE for Gradient Boosted Trees", tree_mae_mean, tree_mae_conf_interval)


RMSE for Gradient Boosted Trees (Mean, 95% CI): 1.20, (1.17-1.24)
MAE for Gradient Boosted Trees (Mean, 95% CI): 0.90, (0.88-0.91)


### CASTNet

In [None]:
# Call the castnet_model function to calculate BPR for CASTNet predictions
bpr_results_castnet, predicted_results_castnet = models.castnet_model(multiindexed_gdf, False, first_test_timestep, last_test_timestep, 
                                            num_geoids, bpr_uncertainty_samples=50)


In [None]:
print(f"2019 Average: {np.mean(bpr_results_castnet[0])}")

bpr_samples_both_years = (np.array(bpr_results_castnet[0]) + \
                          np.array(bpr_results_castnet[1]))/2
                        
print(f"""CASTNet model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f},
      ({np.percentile(bpr_samples_both_years,2.5)*100:.1f}-
       {np.percentile(bpr_samples_both_years,97.5)*100:.1f})""")

2019 Average: 0.2796877101780102
CASTNet model (Mean, 95% CI): 29.0,
      (27.8-
       30.4)


In [None]:
castnet_rmse_results, castnet_mae_results  = evaluation.calculate_metrics(evaluation_deaths, predicted_results_castnet)

In [None]:
castnet_rmse_mean, castnet_rmse_conf_interval = castnet_rmse_results
castnet_mae_mean, castnet_mae_conf_interval = castnet_mae_results

evaluation.print_results("RMSE for CASTNet", castnet_rmse_mean, castnet_rmse_conf_interval)
evaluation.print_results("MAE for CASTNet", castnet_mae_mean, castnet_mae_conf_interval)


RMSE for CASTNet (Mean, 95% CI): 2.11, (2.05-2.18)
MAE for CASTNet (Mean, 95% CI): 1.59, (1.48-1.70)
