In [2]:
"""Run a zero-inflated GP on opioid data"""
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import copy

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import gpflow
import tensorflow as tf
import sys
code_dir = '/cluster/home/kheuto01/code/zero-inflated-gp/'
sys.path.append(code_dir)
from math import radians, cos, sin, asin, sqrt
from onoffgpf import OnOffSVGP, OnOffLikelihood

import pickle

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance between two points given in decimal degrees.

    The first point is (lon1, lat1) and the second is (lon2, lat2); note the
    longitude-first argument order. The result is in kilometers because the
    Earth radius used below is 6371 km (3956 would yield miles instead).
    Formula source: https://stackoverflow.com/a/4913653/1748679
    """
    earth_radius_km = 6371

    # The trigonometric functions below expect radians.
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    # Haversine of the central angle between the two points.
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2

    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius_km


def top_X(y_true, y_pred, X=10):
    """Score how well the X highest-predicted locations capture the X highest true values.

    Parameters
    ----------
    y_true, y_pred : pandas.Series
        True and predicted values, indexed by the same location labels.
    X : int
        Number of top locations to compare.

    Returns
    -------
    tuple
        ``(best_possible_absolute, best_possible_ratio,
        bootstrapped_absolute, bootstrapped_ratio)``.

        * best_possible_*: ties at the top-X cutoff are resolved in the way
          that makes truth and prediction agree most (smallest |true - pred|).
          ``absolute`` is the gap between the true total of the true top-X and
          the true total of the predicted top-X; ``ratio`` is the latter
          divided by the former.
        * bootstrapped_*: the same two quantities, averaged over 1000 random
          resolutions of the predicted ties and measured against the plain
          sorted true top-X.

    Raises
    ------
    AssertionError
        If X locations cannot be selected (e.g. fewer than X entries).

    Notes
    -----
    The bootstrap draws from NumPy's global random state; seed it
    (``np.random.seed``) before calling for reproducible results.
    """
    n_bootstrap = 1000

    top_X_predicted = y_pred.sort_values(ascending=False)[:X]
    top_X_true = y_true.sort_values(ascending=False)[:X]

    # Value of the last place inside the top X; everything strictly above it
    # is in the top X no matter how ties are broken.
    pred_cutoff = top_X_predicted.min()
    true_cutoff = top_X_true.min()

    undisputed_top_predicted = top_X_predicted[top_X_predicted > pred_cutoff]
    num_tied_spots = X - len(undisputed_top_predicted)
    undisputed_top_true = top_X_true[top_X_true > true_cutoff]
    num_true_ties = X - len(undisputed_top_true)

    # Fix: take tie candidates from the WHOLE series, not only from the rows
    # that happened to land inside the sorted top-X slice. Otherwise the
    # result depends on the arbitrary order sort_values gives to equal values.
    tied_top_predicted = y_pred[y_pred == pred_cutoff]
    tied_top_true = y_true[y_true == true_cutoff]

    # Rank tied candidates by how closely truth and prediction agree there.
    error_in_top_true_ties = np.abs(tied_top_true - y_pred[tied_top_true.index]).sort_values(ascending=True)
    error_in_top_pred_ties = np.abs(y_true[tied_top_predicted.index] - tied_top_predicted).sort_values(ascending=True)
    top_true_tied_geoids = error_in_top_true_ties[:num_true_ties].index
    top_pred_tied_geoids = error_in_top_pred_ties[:num_tied_spots].index

    best_possible_top_true_geoids = undisputed_top_true.index.union(top_true_tied_geoids)
    best_possible_top_pred_geoids = undisputed_top_predicted.index.union(top_pred_tied_geoids)

    # Both totals are TRUE values: one over the true top-X locations, one over
    # the locations the prediction ranked in its top X.
    best_possible_true = y_true[best_possible_top_true_geoids]
    best_possible_pred = y_true[best_possible_top_pred_geoids]

    assert (len(best_possible_true) == X)
    assert (len(best_possible_pred) == X)

    best_possible_absolute = np.abs(best_possible_true.sum() - best_possible_pred.sum())
    best_possible_ratio = np.abs(best_possible_pred).sum() / np.abs(best_possible_true).sum()

    # Fix: sample tied locations WITHOUT replacement. Drawing with replacement
    # can pick the same location twice; the union then collapses the duplicate
    # and the bootstrap set silently has fewer than X locations.
    tied_candidates = tied_top_predicted.index.to_numpy()
    bootstrapped_all_indices = [
        undisputed_top_predicted.index.union(
            np.random.choice(tied_candidates, num_tied_spots, replace=False))
        for _ in range(n_bootstrap)]

    bootstrapped_absolute = np.mean([np.abs(top_X_true.sum() - y_true[indices].sum())
                                     for indices in bootstrapped_all_indices])
    bootstrapped_ratio = np.mean([np.abs(y_true[indices]).sum() / np.abs(top_X_true).sum()
                                  for indices in bootstrapped_all_indices])

    return best_possible_absolute, best_possible_ratio, bootstrapped_absolute, bootstrapped_ratio

def normcdf(x):
    """Standard normal CDF of ``x``, squeezed into the range [1e-3, 1 - 1e-3].

    The CDF is computed from ``tf.math.erf`` and then rescaled by
    ``(1 - 2e-3)`` and shifted by ``1e-3``, so the result never reaches
    exactly 0 or 1.
    """
    standard_cdf = 0.5 * (1.0 + tf.math.erf(x / np.sqrt(2.0)))
    return standard_cdf * (1. - 2.e-3) + 1.e-3


def fixed_top_X(true_qtr_val, pred_qtr_val, X=10):
    """Evaluate how much of the true top-X total the predicted top-X locations capture.

    Parameters
    ----------
    true_qtr_val, pred_qtr_val : pandas.Series
        True and predicted values, indexed by the same location labels.
    X : int
        Number of top locations to compare.

    Returns
    -------
    tuple
        ``(best_possible_absolute, best_possible_ratio,
        bootstrapped_absolute, bootstrapped_ratio)``.

        * best_possible_*: ties at the top-X cutoff (taken over the whole
          series) are resolved by keeping the tied locations with the smallest
          |true - pred|. ``absolute`` is the gap between the true total of the
          true top-X and the true total of the predicted top-X; ``ratio`` is
          the latter divided by the former.
        * bootstrapped_*: the same two quantities, averaged over 1000 random
          resolutions of the predicted ties and measured against the plain
          sorted true top-X.

    Raises
    ------
    AssertionError
        If X locations cannot be selected (e.g. fewer than X entries).

    Notes
    -----
    The bootstrap draws from NumPy's global random state; seed it
    (``np.random.seed``) before calling for reproducible results.
    """
    n_bootstrap = 1000

    top_X_predicted = pred_qtr_val.sort_values(ascending=False)[:X]
    top_X_true = true_qtr_val.sort_values(ascending=False)[:X]

    # Value of the last place inside the top X; everything strictly above it
    # is in the top X no matter how ties are broken.
    pred_cutoff = top_X_predicted.min()
    true_cutoff = top_X_true.min()

    undisputed_top_predicted = top_X_predicted[top_X_predicted > pred_cutoff]
    num_tied_spots = X - len(undisputed_top_predicted)
    undisputed_top_true = top_X_true[top_X_true > true_cutoff]
    num_true_ties = X - len(undisputed_top_true)

    # Every location sitting exactly on the cutoff competes for the remaining
    # spots, whether or not the sort placed it inside the top-X slice.
    tied_top_predicted = pred_qtr_val[pred_qtr_val == pred_cutoff]
    tied_top_true = true_qtr_val[true_qtr_val == true_cutoff]

    # Rank tied candidates by how closely truth and prediction agree there.
    error_in_top_true_ties = np.abs(tied_top_true - pred_qtr_val[tied_top_true.index]).sort_values(ascending=True)
    error_in_top_pred_ties = np.abs(true_qtr_val[tied_top_predicted.index] - tied_top_predicted).sort_values(
        ascending=True)
    top_true_tied_geoids = error_in_top_true_ties[:num_true_ties].index
    top_pred_tied_geoids = error_in_top_pred_ties[:num_tied_spots].index

    best_possible_top_true_geoids = undisputed_top_true.index.union(top_true_tied_geoids)
    best_possible_top_pred_geoids = undisputed_top_predicted.index.union(top_pred_tied_geoids)

    # Both totals are TRUE values: one over the true top-X locations, one over
    # the locations the prediction ranked in its top X.
    best_possible_true = true_qtr_val[best_possible_top_true_geoids]
    best_possible_pred = true_qtr_val[best_possible_top_pred_geoids]

    assert (len(best_possible_true) == X)
    assert (len(best_possible_pred) == X)

    best_possible_absolute = np.abs(best_possible_true.sum() - best_possible_pred.sum())
    best_possible_ratio = np.abs(best_possible_pred).sum() / np.abs(best_possible_true).sum()

    # Fix: sample tied locations WITHOUT replacement. Drawing with replacement
    # can pick the same location twice; the union then collapses the duplicate
    # and the bootstrap set silently has fewer than X locations, which biases
    # the bootstrapped totals downward.
    tied_candidates = tied_top_predicted.index.to_numpy()
    bootstrapped_all_indices = [
        undisputed_top_predicted.index.union(
            np.random.choice(tied_candidates, num_tied_spots, replace=False))
        for _ in range(n_bootstrap)]

    bootstrapped_absolute = np.mean([np.abs(top_X_true.sum() - true_qtr_val[indices].sum())
                                     for indices in bootstrapped_all_indices])
    bootstrapped_ratio = np.mean([np.abs(true_qtr_val[indices]).sum() / np.abs(top_X_true).sum()
                                  for indices in bootstrapped_all_indices])

    return best_possible_absolute, best_possible_ratio, bootstrapped_absolute, bootstrapped_ratio

In [3]:
# Locations of the raw inputs and of previously generated result files.
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'
result_dir = os.path.join(data_dir, 'results_20220606_update')
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')

# Town-level monthly table (one row per location per month, judging by the
# 'year'/'month' filters applied to it below).
svi_file = os.path.join(result_dir, 'svi_month_town')
svi_gdf = gpd.read_file(svi_file)
# Call it "grid_squar" because shapefile column names are limited to 10 characters
svi_gdf = svi_gdf.rename(columns={'INTPTLAT20': 'lat', 'INTPTLON20': 'lon', 'GEOID20': 'grid_squar'})
# Make lat and lon floats. Replace the whole column rather than assigning
# through `.loc[:, col]`: under pandas >= 2.0 the `.loc` form writes into the
# existing column and keeps its original dtype, so the cast would be lost.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
deaths_gdf = svi_gdf

# Used when we just need the unique tracts and their locations.
# The rows for January 2000 serve as the one-row-per-location table.
# .copy() gives an independent frame, so writing the scratch 'haversine'
# column below does not trigger SettingWithCopyWarning on a filtered slice.
just_grid = deaths_gdf.loc[
    (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1), ['grid_squar', 'geometry', 'lat', 'lon']].copy()

# Calculate each squares neighbors: every location whose (lat, lon) point lies
# within this many kilometers. A location is always its own neighbor, since its
# distance to itself is 0.
neighbor_radius_km = 8
neighbors = {}
for _, row in just_grid.iterrows():
    # Distance from this location to every location (including itself).
    just_grid['haversine'] = just_grid.apply(lambda x: haversine(row['lon'], row['lat'],
                                                                 x['lon'], x['lat']),
                                             axis=1)
    matching_neighbors = just_grid[just_grid['haversine'] < neighbor_radius_km]['grid_squar'].values
    neighbors[row['grid_squar']] = matching_neighbors

# Unique location ids and the range of years present in the data.
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
# Index by (location, year, month) and sort so the idx[...] slices below work.
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'month']).sort_index()

# Running counters for each time granularity. They start at 0 in January of
# min_year. NOTE(review): the "_since_2000" names only match if min_year is
# 2000 — confirm against the data.
month_since_2000 = 0
season_since_2000 = 0
qtr_since_2000 = 0
year_since_2000 = 0
for year in range(min_year, max_year + 1):
    for month in range(1, 12 + 1):

        # Half-year label for this month.
        if month in [1, 2, 3, 4, 5, 6]:
            season = 'jan-jun'
        else:
            season = 'jul-dec'

        # Calendar quarter (1-4) for this month.
        if month <= 3:
            qtr = 1
        elif month <= 6:
            qtr = 2
        elif month <= 9:
            qtr = 3
        else:
            qtr = 4

        # Stamp every location's row for this (year, month) with the current
        # labels and counters.
        # NOTE(review): presumably every (year, month) in the range exists in
        # the table — TODO confirm what happens for a missing month.
        deaths_gdf.loc[idx[:, year, month], 'month_since_2000'] = month_since_2000
        deaths_gdf.loc[idx[:, year, month], 'season'] = season
        deaths_gdf.loc[idx[:, year, month], 'season_since_2000'] = season_since_2000
        deaths_gdf.loc[idx[:, year, month], 'quarter'] = qtr
        deaths_gdf.loc[idx[:, year, month], 'qtr_since_2000'] = qtr_since_2000
        deaths_gdf.loc[idx[:, year, month], 'year_since_2000'] = year_since_2000

        month_since_2000 += 1

        # Advance the coarser counters after the last month of each period.
        if month in [6, 12]:
            season_since_2000 += 1

        if month in [3, 6, 9, 12]:
            qtr_since_2000 += 1

        if month == 12:
            year_since_2000 += 1

# Back to a flat table; grid_squar/year/month become ordinary columns again.
deaths_gdf = deaths_gdf.reset_index()

# Aggregate the monthly rows to one row per (location, year, quarter) and
# build the two autoregressive features.
timestep_col = 'quarter'

# The rows are still monthly here, so this index contains repeated keys (the
# de-duplication and groupby-sum below collapse them).
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'quarter']).sort_index()
# 'self_t-1' = the 'deaths' value of the previous ROW in the sorted table.
# NOTE(review): the shift is not grouped by grid_squar, so the first row of
# each location receives the last row of the preceding location; and because
# rows are monthly at this point, the lag is one row rather than one quarter.
# Confirm this matches how the saved model's features were built before
# changing it.
deaths_gdf.loc[idx[:, :, :], 'self_t-1'] = deaths_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every (location, year, quarter) key; supplies the non-summed columns.
unduped_gdf = deaths_gdf[~deaths_gdf.index.duplicated(keep='first')]
# Total deaths per (location, year, quarter).
summed_deaths = deaths_gdf.groupby(level=[0,1,2]).sum()[['deaths']]
# Re-attach the other columns; the first-row 'deaths' arrives as
# 'deaths_garbage' and is dropped in favour of the summed value.
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
deaths_gdf = summed_deaths
# 'neighbor_t-1' = mean 'self_t-1' over the location's neighbors for each
# (year, quarter), shifted by one further step.
# NOTE(review): 'self_t-1' is already lagged, so the extra shift looks like a
# second lag — confirm intended. The result is assigned positionally via
# .values, which assumes the location has exactly one row per (year, quarter)
# produced by the groupby, in the same order — TODO confirm.
for tract in tracts:
    deaths_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        deaths_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(level=['year', 'quarter']).mean().shift(1,
                                                                                                                fill_value=0).values

# Number the (year, quarter) pairs consecutively, starting at 0 for the first
# quarter of min_year, and store that number on every location's row.
timestep = 0
for year, quarter in ((y, q) for y in range(min_year, max_year + 1) for q in range(1, 5)):
    deaths_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
    timestep += 1

deaths_gdf_with_autoregressive = deaths_gdf.reset_index()

# NOTE(review): these two lists name 'neighbors_last_timestep' and
# 'last_timestep', while the columns created above are 'neighbor_t-1' and
# 'self_t-1'. Neither list is used by the selections below.
features_no_idx = ['lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                   'svi_pctile', 'neighbors_last_timestep', 'last_timestep']
features = ['grid_squar', 'year', 'quarter'] + features_no_idx

# Column sets shared by every split: model inputs (plus the location id) and targets.
x_columns = ['grid_squar', 'lat', 'lon', 'timestep', 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
             'svi_pctile', 'neighbor_t-1', 'self_t-1']
y_columns = ['grid_squar', 'timestep', 'deaths']

row_year = deaths_gdf_with_autoregressive['year']
row_quarter = deaths_gdf_with_autoregressive['quarter']

# Training windows: everything up to and including the given year.
rows_through_2018 = deaths_gdf_with_autoregressive[row_year <= 2018]
train_x_through_2018 = rows_through_2018[x_columns]
train_y_through_2018 = rows_through_2018[y_columns]

rows_through_2019 = deaths_gdf_with_autoregressive[row_year <= 2019]
train_x_through_2019 = rows_through_2019[x_columns]
train_y_through_2019 = rows_through_2019[y_columns]

# Single-year evaluation windows.
rows_2019 = deaths_gdf_with_autoregressive[row_year == 2019]
x_just_2019 = rows_2019[x_columns]
y_just_2019 = rows_2019[y_columns]

rows_2020 = deaths_gdf_with_autoregressive[row_year == 2020]
x_just_2020 = rows_2020[x_columns]
y_just_2020 = rows_2020[y_columns]

# First quarter of 2019 only.
rows_2019q1 = deaths_gdf_with_autoregressive[(row_year == 2019) & (row_quarter == 1)]
x_just_2019q1 = rows_2019q1[x_columns]
y_just_2019q1 = rows_2019q1[y_columns]

# Kernels over (lat, lon) and over the time column of the feature matrix.
# NOTE(review): in GPflow 2.x the first positional argument of RBF is the
# variance, not the input dimension — confirm which GPflow version this targets.
spatial_kernel = gpflow.kernels.RBF(2, active_dims=[0, 1])
temporal_kernel = gpflow.kernels.RBF(1, active_dims=[2])

2023-02-08 12:18:37.351480: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-02-08 12:18:37.351527: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: p1cmp078.pax.tufts.edu
2023-02-08 12:18:37.351536: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: p1cmp078.pax.tufts.edu
2023-02-08 12:18:37.351688: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.47.3
2023-02-08 12:18:37.351720: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.47.3
2023-02-08 12:18:37.351725: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 510.47.3
2023-02-08 12:18:37.352214: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the fo

In [None]:
import contextily as cx

from shapely.geometry import Point, Polygon

# Shapefiles for census tracts and for towns.
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')
town_shapefile = os.path.join(data_dir,'shapefiles','MA_2020_Towns',
                              'CENSUS2020TOWNS_POLY.shp')

tract_gdf = gpd.read_file(mass_shapefile)
# Replace the whole column: assigning through `.loc[:, col]` keeps the old
# dtype under pandas >= 2.0, so the int cast would be lost.
tract_gdf['TRACTCE'] = tract_gdf['TRACTCE'].astype(int)

town_shapes = gpd.read_file(town_shapefile)
# Reproject the town polygons to EPSG:4269. The authority string replaces the
# deprecated {'init': 'epsg:4269'} dictionary form.
towns_lat_lon = town_shapes.to_crs('EPSG:4269')

# NOTE: this rebinds svi_file / svi_gdf from the town-level table loaded
# earlier to the tract-level table.
svi_file = os.path.join(result_dir,'svi_month')
svi_gdf = gpd.read_file(svi_file)
# One row per tract, taken from the January 2000 rows. .copy() so adding the
# 'points' column below does not write into a filtered slice of svi_gdf.
is_first_month = (svi_gdf['year']==2000)&(svi_gdf['month']==1)
just_tracts = svi_gdf.loc[is_first_month, ['GEOID','geometry', 'INTPTLAT', 'INTPTLON']].copy()
# Point built from each tract's INTPTLON / INTPTLAT columns. The builtin float
# is used because the np.float alias was removed in NumPy 1.24.
just_tracts['points'] = just_tracts.apply(lambda x: Point(float(x['INTPTLON']), float(x['INTPTLAT'])), axis=1)
# One row per town, taken from the rows with year 2000 and month 1 of the
# town-level table. The index is reset once instead of three times.
town_rows = deaths_gdf.reset_index()
just_towns = town_rows.loc[(town_rows['year']==2000)&(town_rows['month']==1)]

In [10]:
import pickle
# Load the previously saved town-level model from disk.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
# NOTE(review): unpickling presumably requires the onoffgpf classes imported at
# the top of the notebook to be importable — confirm.
with open('/cluster/home/kheuto01/try_thing_town/model.mod','rb') as f:
    m_city = pickle.load(f)

In [14]:
# Predict for 2019 Q1 with the loaded model and keep only the first element of
# what predict_onoffgp returns.
# NOTE(review): presumably that first element is the predicted mean — confirm
# against onoffgpf.
sg_2019 = m_city.predict_onoffgp(
    x_just_2019q1.loc[
        :,
        ['lat', 'lon', 'timestep',
         'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
         'svi_pctile', 'neighbor_t-1', 'self_t-1'],
    ].values
)[0]

In [19]:
# Predictions as a Series (despite the _df suffix) keyed by location id, so a
# single location's prediction can be looked up by its grid_squar.
pred_2019_df = pd.Series(
    data=sg_2019.numpy().squeeze(),
    index=x_just_2019q1['grid_squar'],
)


In [39]:
%%capture
# Spread each town-level prediction onto the census tracts associated with
# that town. Per-town frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
tract_centroids = just_tracts.geometry.centroid  # same for every town, so computed once
tract_pred_frames = []
for town_id in just_towns.grid_squar.unique():

    pred_deaths = pred_2019_df[town_id]

    this_town = just_towns[(just_towns['grid_squar']==town_id)]
    town_geo = this_town['geometry'].values[0]

    # A tract belongs to the town if it covers the town's centroid, or if the
    # tract's own centroid falls inside the town polygon.
    tracts_containing_town = just_tracts[just_tracts.geometry.intersects(town_geo.centroid)]
    tracts_in_town = just_tracts[tract_centroids.intersects(town_geo)]

    child_tracts = set(np.append(tracts_containing_town.GEOID.values,tracts_in_town.GEOID.values))

    # NOTE(review): this MULTIPLIES the town prediction by the number of child
    # tracts. Splitting a town total across its tracts would normally divide —
    # confirm before trusting the downstream metric.
    these_tract_preds = pd.DataFrame({'grid_squar':list(child_tracts), 'deaths':pred_deaths*len(child_tracts)})
    tract_pred_frames.append(these_tract_preds)

tract_level_preds = pd.concat(tract_pred_frames) if tract_pred_frames else pd.DataFrame()
    


In [52]:
# Sanity check: does the id '2500103690' occur among the disaggregated rows?
# NOTE(review): in the saved outputs this id appears as a 10-digit grid_squar
# in the town-level targets, whereas the tract GEOID shown is 11 digits, so a
# miss is expected if this column holds tract ids — confirm.
(tract_level_preds['grid_squar']=='2500103690').max()

False

In [53]:
# Leftover variable from the disaggregation loop: the tracts that intersect the
# centroid of the LAST town the loop processed.
tracts_containing_town

Unnamed: 0,GEOID,geometry,INTPTLAT,INTPTLON,points
1537,25027731601,"POLYGON ((-71.81776 42.26729, -71.81767 42.267...",42.2681308,-71.8095005,POINT (-71.80950 42.26813)


In [54]:
# Inspect the 2019 Q1 targets as currently bound (the town-level table at this
# point in the notebook).
y_just_2019q1

Unnamed: 0,grid_squar,timestep,deaths
76,2500103690,76.0,3.0
164,2500107175,76.0,2.0
252,2500107980,76.0,2.0
340,2500112995,76.0,0.0
428,2500116775,76.0,3.0
...,...,...,...
30524,2502775155,76.0,1.0
30612,2502775400,76.0,0.0
30700,2502777010,76.0,0.0
30788,2502780405,76.0,1.0


In [55]:
# Second pass: rebuild the table from the tract-level file. This rebinds the
# names used by the town-level pass earlier in the notebook (svi_gdf,
# deaths_gdf, ...).
svi_file = os.path.join(result_dir, 'svi_month')
svi_gdf = gpd.read_file(svi_file)
# Call it "grid_squar" because shapefile column names are limited to 10 characters
svi_gdf = svi_gdf.rename(columns={'INTPTLAT': 'lat', 'INTPTLON': 'lon', 'GEOID': 'grid_squar'})
# Make lat and lon floats. Replace the whole column rather than assigning
# through `.loc[:, col]`: under pandas >= 2.0 the `.loc` form writes into the
# existing column and keeps its original dtype, so the cast would be lost.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
deaths_gdf = svi_gdf

# Used when we just need the unique tracts and their locations.
# The rows for January 2000 serve as the one-row-per-tract table.
# .copy() gives an independent frame, so writing the scratch 'haversine'
# column below does not trigger SettingWithCopyWarning on a filtered slice.
just_grid = deaths_gdf.loc[
    (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1), ['grid_squar', 'geometry', 'lat', 'lon']].copy()

# Calculate each squares neighbors: every tract whose (lat, lon) point lies
# within this many kilometers. A tract is always its own neighbor, since its
# distance to itself is 0.
neighbor_radius_km = 8
neighbors = {}
for _, row in just_grid.iterrows():
    # Distance from this tract to every tract (including itself).
    just_grid['haversine'] = just_grid.apply(lambda x: haversine(row['lon'], row['lat'],
                                                                 x['lon'], x['lat']),
                                             axis=1)
    matching_neighbors = just_grid[just_grid['haversine'] < neighbor_radius_km]['grid_squar'].values
    neighbors[row['grid_squar']] = matching_neighbors

# Unique tract ids and the range of years present in the data.
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
# Index by (tract, year, month) and sort so the idx[...] slices below work.
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'month']).sort_index()

# Running counters for each time granularity. They start at 0 in January of
# min_year. NOTE(review): the "_since_2000" names only match if min_year is
# 2000 — confirm against the data.
month_since_2000 = 0
season_since_2000 = 0
qtr_since_2000 = 0
year_since_2000 = 0
for year in range(min_year, max_year + 1):
    for month in range(1, 12 + 1):

        # Half-year label for this month.
        if month in [1, 2, 3, 4, 5, 6]:
            season = 'jan-jun'
        else:
            season = 'jul-dec'

        # Calendar quarter (1-4) for this month.
        if month <= 3:
            qtr = 1
        elif month <= 6:
            qtr = 2
        elif month <= 9:
            qtr = 3
        else:
            qtr = 4

        # Stamp every tract's row for this (year, month) with the current
        # labels and counters.
        # NOTE(review): presumably every (year, month) in the range exists in
        # the table — TODO confirm what happens for a missing month.
        deaths_gdf.loc[idx[:, year, month], 'month_since_2000'] = month_since_2000
        deaths_gdf.loc[idx[:, year, month], 'season'] = season
        deaths_gdf.loc[idx[:, year, month], 'season_since_2000'] = season_since_2000
        deaths_gdf.loc[idx[:, year, month], 'quarter'] = qtr
        deaths_gdf.loc[idx[:, year, month], 'qtr_since_2000'] = qtr_since_2000
        deaths_gdf.loc[idx[:, year, month], 'year_since_2000'] = year_since_2000

        month_since_2000 += 1

        # Advance the coarser counters after the last month of each period.
        if month in [6, 12]:
            season_since_2000 += 1

        if month in [3, 6, 9, 12]:
            qtr_since_2000 += 1

        if month == 12:
            year_since_2000 += 1

# Back to a flat table; grid_squar/year/month become ordinary columns again.
deaths_gdf = deaths_gdf.reset_index()
# In this pass the time step is the YEAR counter, so the aggregation below is
# per (tract, year, year_since_2000), not per quarter.
timestep_col = 'year_since_2000'
deaths_gdf.loc[:, 'timestep'] = deaths_gdf.loc[:, timestep_col]
# The rows are still monthly here, so this index contains repeated keys (the
# de-duplication and groupby-sum below collapse them).
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', timestep_col]).sort_index()
# 'self_t-1' = the 'deaths' value of the previous ROW in the sorted table.
# NOTE(review): the shift is not grouped by grid_squar, so the first row of
# each tract receives the last row of the preceding tract; and because rows
# are monthly at this point, the lag is one row rather than one time step.
# Confirm intended.
deaths_gdf.loc[idx[:, :, :], 'self_t-1'] = deaths_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every key; supplies the non-summed columns. The 'quarter'
# column kept here is therefore the quarter of that first row.
unduped_gdf = deaths_gdf[~deaths_gdf.index.duplicated(keep='first')]
# Total deaths per (tract, year, year_since_2000).
summed_deaths = deaths_gdf.groupby(level=[0,1,2]).sum()[['deaths']]
# Re-attach the other columns; the first-row 'deaths' arrives as
# 'deaths_garbage' and is dropped in favour of the summed value.
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
deaths_gdf = summed_deaths
# 'neighbor_t-1' = mean 'self_t-1' over the tract's neighbors for each time
# step, shifted by one further step.
# NOTE(review): 'self_t-1' is already lagged, so the extra shift looks like a
# second lag — confirm intended. The result is assigned positionally via
# .values, which assumes the tract has exactly one row per group produced by
# the groupby, in the same order — TODO confirm.
for tract in tracts:
    deaths_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        deaths_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(level=['year', timestep_col]).mean().shift(1,
                                                                                                                fill_value=0).values

timestep = 0


deaths_gdf_with_autoregressive = deaths_gdf.reset_index()

# NOTE(review): these two lists name 'neighbors_last_timestep' and
# 'last_timestep', while the columns created above are 'neighbor_t-1' and
# 'self_t-1'. Neither list is used by the selections below.
features_no_idx = ['lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                   'svi_pctile', 'neighbors_last_timestep', 'last_timestep']
features = ['grid_squar', 'year', 'quarter'] + features_no_idx

# Column sets shared by every split: model inputs (plus the tract id) and targets.
x_columns = ['grid_squar', 'lat', 'lon', 'timestep', 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
             'svi_pctile', 'neighbor_t-1', 'self_t-1']
y_columns = ['grid_squar', 'timestep', 'deaths']

row_year = deaths_gdf_with_autoregressive['year']
row_quarter = deaths_gdf_with_autoregressive['quarter']

# Training windows: everything up to and including the given year.
rows_through_2018 = deaths_gdf_with_autoregressive[row_year <= 2018]
train_x_through_2018 = rows_through_2018[x_columns]
train_y_through_2018 = rows_through_2018[y_columns]

rows_through_2019 = deaths_gdf_with_autoregressive[row_year <= 2019]
train_x_through_2019 = rows_through_2019[x_columns]
train_y_through_2019 = rows_through_2019[y_columns]

# Single-year evaluation windows.
rows_2019 = deaths_gdf_with_autoregressive[row_year == 2019]
x_just_2019 = rows_2019[x_columns]
y_just_2019 = rows_2019[y_columns]

rows_2020 = deaths_gdf_with_autoregressive[row_year == 2020]
x_just_2020 = rows_2020[x_columns]
y_just_2020 = rows_2020[y_columns]

# Rows of 2019 whose 'quarter' column equals 1.
# NOTE(review): the table was aggregated per year in this pass, so 'deaths'
# here looks like a yearly sum and 'quarter' is whatever the first row of each
# year carried — confirm these really are Q1 values.
rows_2019q1 = deaths_gdf_with_autoregressive[(row_year == 2019) & (row_quarter == 1)]
x_just_2019q1 = rows_2019q1[x_columns]
y_just_2019q1 = rows_2019q1[y_columns]

# Kernels over (lat, lon) and over the time column of the feature matrix.
# NOTE(review): in GPflow 2.x the first positional argument of RBF is the
# variance, not the input dimension — confirm which GPflow version this targets.
spatial_kernel = gpflow.kernels.RBF(2, active_dims=[0, 1])
temporal_kernel = gpflow.kernels.RBF(1, active_dims=[2])

In [66]:
# Align the disaggregated predictions with the evaluation tracts.
# A tract can receive rows from more than one town, so its prediction is the
# mean over its rows; tracts that received no prediction get 0.
# The per-tract mean is computed once with groupby instead of re-indexing the
# whole prediction table and scanning it for membership on every iteration.
mean_pred_by_tract = tract_level_preds.groupby('grid_squar')['deaths'].mean()

tract_ids = list(y_just_2019q1['grid_squar'].values)
deaths = mean_pred_by_tract.reindex(tract_ids, fill_value=0).tolist()

In [68]:
# One row per evaluation tract: its id and its disaggregated prediction.
final_preds = pd.DataFrame(
    data={
        'grid_squar': tract_ids,
        'deaths': deaths,
    }
)

In [69]:
# Leftover inspection cell: displays the function object, not a result.
fixed_top_X

<function __main__.fixed_top_X(true_qtr_val, pred_qtr_val, X=10)>

In [70]:
# Preview the tract-level prediction table.
final_preds

Unnamed: 0,grid_squar,deaths
0,25001010100,0.019381
1,25001010206,0.031325
2,25001010208,0.017843
3,25001010304,0.069675
4,25001010306,0.069675
...,...,...
1615,25027761100,0.313183
1616,25027761200,0.072541
1617,25027761300,0.340030
1618,25027761401,0.148008


In [80]:
# Top-10 evaluation of the disaggregated predictions against the tract-level
# targets, both keyed by tract id. The last two numbers of the result come
# from an unseeded random bootstrap, so they vary between runs.
# NOTE(review): y_just_2019q1 was rebuilt with a yearly time step in the
# tract-level pass, so these targets look like yearly sums while the
# predictions come from the 2019 Q1 town-level inputs — confirm the comparison
# is like-for-like.
fixed_top_X(
    true_qtr_val=y_just_2019q1.set_index('grid_squar')['deaths'],
    pred_qtr_val=final_preds.set_index('grid_squar')['deaths'],
)

(31.0, 0.5373134328358209, 59.078, 0.11823880597014924)

In [81]:
# Leftover inspection cell: displays the function object, not a result.
fixed_top_X

<function __main__.fixed_top_X(true_qtr_val, pred_qtr_val, X=10)>

In [79]:
# Preview the tract-level prediction table (same display as the earlier preview cell).
final_preds

Unnamed: 0,grid_squar,deaths
0,25001010100,0.019381
1,25001010206,0.031325
2,25001010208,0.017843
3,25001010304,0.069675
4,25001010306,0.069675
...,...,...
1615,25027761100,0.313183
1616,25027761200,0.072541
1617,25027761300,0.340030
1618,25027761401,0.148008
