In [98]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor

# local import
from make_datasets import make_data
import models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
%store -r gdf_annual_with_svi
%store -r gdf_quarter_with_svi
%store -r gdf_semi_with_svi

data_gdf = gdf_annual_with_svi

Convert the dataframe into one with a MultiIndex on location (`geoid`) and time (`timestep`).

In [100]:
# ---- Key column names ----
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# ---- Candidate columns ----
# Everything that could go into the X dataframe
x_idx_cols = [
    geography_col, 'lat', 'lon', timestep_col,
    'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
    'svi_pctile', 'year',
    'neighbor_t', 'deaths',
]

# Everything that could go into the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# ---- Feature selection ----
# Past deaths are always a feature; the two switches add optional feature groups.
add_spacetime = True
add_svi = True

features_only = ['deaths']
if add_spacetime:
    features_only.extend(['lat', 'lon', timestep_col])
if add_svi:
    features_only.extend(
        ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile'])

# ---- Evaluation windows (jyontika's parameters) ----
validation_year = 2020
lookback_years = 5
first_train_eval_year = validation_year - lookback_years  # 2015
last_train_eval_year = validation_year - 1                # 2019

first_test_year, last_test_year = 2021, 2022
# NOTE(review): the test timesteps are hard-coded separately from the test years.
# The recorded output of the lookback-window loop further down maps 2019 to timestep 6
# for the annual data, so 6/7 may not correspond to 2021/2022 -- confirm.
first_test_timestep, last_test_timestep = 6, 7

# Open question: lookback_years seems to be capped at 5 -- anything larger pushes
# first_train_eval_year below 2015, and there is no data for those years.
# Known issue: with lookback_years = 5 the training slice (train_x_df.values) comes
# back with size 0.


In [101]:
# Index the frame by (location, time)
index_levels = [geography_col, timestep_col]
multiindexed_gdf = data_gdf.set_index(index_levels)

# set_index moved the timestep into the index; copy it back as a column because it is
# also used as a model feature
timestep_values = multiindexed_gdf.index.get_level_values(timestep_col)
multiindexed_gdf[timestep_col] = timestep_values

# Number of distinct locations
num_geoids = len(multiindexed_gdf.index.get_level_values(geography_col).unique())

In [102]:
# Quick sanity check: lookback length, number of locations, number of features
print(lookback_years, num_geoids, len(features_only), sep="\n")

5
1328
9


In [103]:
# Diagnostic: for every training-evaluation year, list the timesteps that fall in that
# year and check whether the lookback window preceding them selects any rows.
# An empty window here is the condition under which the later reshape of the training
# array fails, so it is reported explicitly instead of being silently ignored.
idx = pd.IndexSlice  # same object as `from pandas import IndexSlice as idx`

for eval_year in range(first_train_eval_year, last_train_eval_year + 1):
    rows_in_year = multiindexed_gdf[multiindexed_gdf['year'] == eval_year]

    # Index.sort_values() returns a new Index (it does not sort in place), so keep it.
    timesteps_in_year = rows_in_year.index.unique(level=timestep_col).sort_values()
    # data_gdf is the annual frame, so presumably each year maps to a single timestep.
    print(timesteps_in_year)

    if len(timesteps_in_year) == 0:
        # min()/max() below would fail on an empty index
        print(f"eval_year {eval_year}: no rows found for this year")
        continue

    # Label-based (inclusive) slice on the timestep level: the `lookback_years`
    # timesteps that end just before the evaluated year.
    window_start = timesteps_in_year.min() - lookback_years
    window_end = timesteps_in_year.max() - 1
    train_x_df = multiindexed_gdf.loc[idx[:, window_start:window_end], features_only]

    if len(train_x_df) == 0:
        print(f"eval_year {eval_year}: lookback window of timesteps "
              f"{window_start}..{window_end} selected no rows")


Index([2], dtype='int64', name='timestep')
Index([3], dtype='int64', name='timestep')
Index([4], dtype='int64', name='timestep')
Index([5], dtype='int64', name='timestep')
Index([6], dtype='int64', name='timestep')


In [104]:
# Year ranges for the two data splits; unpacked positionally into make_data below.
train_eval_years = (first_train_eval_year, last_train_eval_year)
test_years = (first_test_year, last_test_year)

# Full feature set
x_BSF, y_BS = make_data(
    multiindexed_gdf, *train_eval_years, lookback_years, features_only, num_geoids)
x_test_BSF, y_test_BS = make_data(
    multiindexed_gdf, *test_years, lookback_years, features_only, num_geoids)

# The weighted historical average model is fitted on past deaths only
x_BSF_death_only, y_BS_death_only = make_data(
    multiindexed_gdf, *train_eval_years, lookback_years, ['deaths'], num_geoids)
x_test_BSF_death_only, y_test_BS_death_only = make_data(
    multiindexed_gdf, *test_years, lookback_years, ['deaths'], num_geoids)

(0, 9)
Empty DataFrame
Columns: [deaths, lat, lon, timestep, theme_1_pc, theme_2_pc, theme_3_pc, theme_4_pc, svi_pctile]
Index: []


ValueError: cannot reshape array of size 0 into shape (1328,5,9)

In [51]:
# Display the shape of the training inputs returned by make_data.
# NOTE(review): in the recorded output the last two axes equal num_geoids (1328) and
# len(features_only) (9); what the leading axis represents is not visible here -- confirm
# in make_data.
x_BSF.shape

TensorShape([1, 1328, 9])

In [52]:
# Display the shape of the training targets returned by make_data.
# NOTE(review): the recorded output's last axis equals num_geoids (1328); the meaning of
# the leading axis is not visible here -- confirm in make_data.
y_BS.shape

TensorShape([1, 1328])

### All Zeroes Model
#### lookback years =1 

In [53]:
# All-zeroes baseline, scored over the test timesteps
bpr_over_time_zeroes = models.all_zeroes_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    bpr_uncertainty_samples=15,
)

In [54]:
# Summary for the all-zeroes baseline.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.all_zeroes_model.
print(f"2021 Average: {np.mean(bpr_over_time_zeroes[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_zeroes[0]) + np.array(bpr_over_time_zeroes[1])
) / 2

# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Zeroes model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.21106030441478843
Zeroes model (Mean, 95% CI): 21.8,
      (21.1-
       22.3)


### Last Year
#### lookback = 1 year

In [55]:
# Last-year baseline, scored over the test timesteps
bpr_over_time_last_time = models.last_time_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,  # presumably the lookback (section header says 1 year) -- confirm in models.last_time_model
    bpr_uncertainty_samples=15,
)

In [56]:
# Summary for the last-year baseline.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.last_time_model.
print(f"2021 Average: {np.mean(bpr_over_time_last_time[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_last_time[0]) + np.array(bpr_over_time_last_time[1])
) / 2

# Label corrected: it previously read "Zeroes model" (copy-paste from the zeroes cell).
# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Last year model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.6129691007552631
Zeroes model (Mean, 95% CI): 65.3,
      (62.8-
       67.2)


### Historical Average 
#### lookback = 7 years

In [57]:
# Historical-average baseline, scored over the test timesteps
bpr_over_time_avg_time = models.historical_average_model(
    multiindexed_gdf,
    first_test_timestep,
    last_test_timestep,
    num_geoids,
    1,  # NOTE(review): meaning of this positional argument is not visible here -- confirm
    7,  # presumably the lookback (section header says 7 years) -- confirm
    bpr_uncertainty_samples=15,
)

In [58]:
# Summary for the historical-average baseline.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.historical_average_model.
print(f"2021 Average: {np.mean(bpr_over_time_avg_time[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_avg_time[0]) + np.array(bpr_over_time_avg_time[1])
) / 2

# Label corrected: it previously read "Zeroes model" (copy-paste from the zeroes cell).
# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Historical average model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.6619005197240652
Zeroes model (Mean, 95% CI): 72.1,
      (69.9-
       74.1)


### Weighted Historical Average
#### lookback = 7 years

In [59]:
# Identical models, features are only difference
linear_poisson_weighted_avg = sklearn.linear_model.PoissonRegressor()
linear_poisson = sklearn.linear_model.PoissonRegressor()

# Params selected via grid search on validation. Need to re-do grid search for chicago
hist_poisson =   HistGradientBoostingRegressor(loss="poisson", max_iter=10000, max_depth=3, max_leaf_nodes=2,
                                               l2_regularization=1, min_samples_leaf=100 )

In [62]:
# Weighted historical average: the Poisson GLM fitted on the deaths-only inputs
bpr_over_time_weight_avg = models.scikit_model(
    multiindexed_gdf,
    x_BSF_death_only,
    y_BS_death_only,
    x_test_BSF_death_only,
    linear_poisson_weighted_avg,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [63]:
# Summary for the weighted historical average model.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.scikit_model.
print(f"2021 Average: {np.mean(bpr_over_time_weight_avg[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_weight_avg[0]) + np.array(bpr_over_time_weight_avg[1])
) / 2

# Label corrected: it previously read "Zeroes model" (copy-paste from the zeroes cell).
# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Weighted historical average model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.6583725428313024
Zeroes model (Mean, 95% CI): 82.9,
      (81.8-
       84.4)


### Linear (Poisson GLM baseline)
#### lookback years = ?

In [65]:
# Poisson GLM baseline fitted on the full feature set
bpr_over_time_linear = models.scikit_model(
    multiindexed_gdf,
    x_BSF,
    y_BS,
    x_test_BSF,
    linear_poisson,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [66]:
# Summary for the linear (Poisson GLM) baseline.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.scikit_model.
print(f"2021 Average: {np.mean(bpr_over_time_linear[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_linear[0]) + np.array(bpr_over_time_linear[1])
) / 2

# Label corrected: it previously read "Zeroes model" (copy-paste from the zeroes cell).
# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Linear (Poisson GLM) model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.6727326458547467
Zeroes model (Mean, 95% CI): 83.5,
      (82.2-
       85.0)


### Gradient Boosted Trees (Poisson)
#### lookback years = ?

In [68]:
# Poisson gradient-boosted trees fitted on the full feature set
bpr_over_time_tree = models.scikit_model(
    multiindexed_gdf,
    x_BSF,
    y_BS,
    x_test_BSF,
    hist_poisson,
    first_test_timestep,
    last_test_timestep,
    bpr_uncertainty_samples=20,
)

In [69]:
# Summary for the Poisson gradient-boosted trees model.
# Presumably entries 0 and 1 of the result hold the samples for the two test timesteps
# -- confirm against models.scikit_model.
print(f"2021 Average: {np.mean(bpr_over_time_tree[0])}")

# Element-wise average of the two entries
bpr_samples_both_years = (
    np.array(bpr_over_time_tree[0]) + np.array(bpr_over_time_tree[1])
) / 2

# Label corrected: it previously read "Zeroes model" (copy-paste from the zeroes cell).
# Printed on one line: the previous triple-quoted f-string leaked its newlines and
# indentation into the output.
print(f"Gradient boosted trees model (Mean, 95% CI): {np.mean(bpr_samples_both_years)*100:.1f}, "
      f"({np.percentile(bpr_samples_both_years, 2.5)*100:.1f}-"
      f"{np.percentile(bpr_samples_both_years, 97.5)*100:.1f})")


2021 Average: 0.7044756170008526
Zeroes model (Mean, 95% CI): 74.1,
      (71.6-
       76.5)
