In [1]:
"""Run a zero-inflated GP on opioid data"""
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import copy

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import gpflow
import tensorflow as tf
import sys
code_dir = '/cluster/home/kheuto01/code/zero-inflated-gp/'
sys.path.append(code_dir)
from math import radians, cos, sin, asin, sqrt
from onoffgpf import OnOffSVGP, OnOffLikelihood

import pickle

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    https://stackoverflow.com/a/4913653/1748679

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Great-circle distance in kilometers (earth radius taken as 6371 km).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r


def top_X(y_true, y_pred, X=10):
    """Score how much of the true top-X total is captured by the predicted top X.

    Both inputs are pandas Series that share the same index labels. Ties at the
    cutoff value are only looked for among the X entries that the sort already
    selected, not in the full series.

    Parameters
    ----------
    y_true : pandas.Series
        Observed values.
    y_pred : pandas.Series
        Predicted values.
    X : int, default 10
        Number of top-ranked locations to compare.

    Returns
    -------
    tuple
        ``(best_possible_absolute, best_possible_ratio,
        bootstrapped_absolute, bootstrapped_ratio)``. The "best possible" pair
        fills tied spots with the locations whose true and predicted values are
        closest; the "bootstrapped" pair averages over 1000 random fillings.
    """
    ranked_pred = y_pred.sort_values(ascending=False)[:X]
    ranked_true = y_true.sort_values(ascending=False)[:X]

    pred_cutoff = ranked_pred.min()
    true_cutoff = ranked_true.min()

    # Entries strictly above the cutoff belong to the top X however ties are broken
    clear_pred = ranked_pred[ranked_pred > pred_cutoff]
    clear_true = ranked_true[ranked_true > true_cutoff]
    open_pred_slots = X - len(clear_pred)
    open_true_slots = X - len(clear_true)

    # Entries sitting exactly on the cutoff compete for the remaining slots
    tied_pred = ranked_pred[ranked_pred == pred_cutoff]
    tied_true = ranked_true[ranked_true == true_cutoff]

    # Rank tied entries by how closely truth and prediction agree
    true_tie_error = np.abs(tied_true - y_pred[tied_true.index]).sort_values(ascending=True)
    pred_tie_error = np.abs(y_true[tied_pred.index] - tied_pred).sort_values(ascending=True)

    chosen_true_labels = clear_true.index.union(true_tie_error[:open_true_slots].index)
    chosen_pred_labels = clear_pred.index.union(pred_tie_error[:open_pred_slots].index)

    # Both selections are evaluated on the TRUE values
    best_possible_true = y_true[chosen_true_labels]
    best_possible_pred = y_true[chosen_pred_labels]

    assert (len(best_possible_true) == X)
    assert (len(best_possible_pred) == X)

    best_possible_absolute = np.abs(best_possible_true.sum() - best_possible_pred.sum())
    best_possible_ratio = np.abs(best_possible_pred).sum() / np.abs(best_possible_true).sum()

    # 1000 random fillings of the tied slots
    tie_draws = np.random.choice(tied_pred.index, (1000, open_pred_slots))

    absolute_gaps = []
    capture_ratios = []
    for draw in tie_draws:
        captured = y_true[clear_pred.index.union(draw)]
        absolute_gaps.append(np.abs(ranked_true.sum() - captured.sum()))
        capture_ratios.append(np.abs(captured).sum() / np.abs(ranked_true).sum())

    bootstrapped_absolute = np.mean(absolute_gaps)
    bootstrapped_ratio = np.mean(capture_ratios)

    return best_possible_absolute, best_possible_ratio, bootstrapped_absolute, bootstrapped_ratio

def normcdf(x):
    """Standard normal CDF of `x`, linearly rescaled into [1e-3, 1 - 1e-3].

    The rescaling keeps the result strictly away from 0 and 1.
    """
    standard_cdf = 0.5 * (1.0 + tf.math.erf(x / np.sqrt(2.0)))
    return standard_cdf * (1. - 2.e-3) + 1.e-3


def fixed_top_X(true_qtr_val, pred_qtr_val, X=10, n_bootstrap=1000):
    """Score how much of the true top-X total is captured by the predicted top X.

    Both inputs are pandas Series that share the same index labels. Ties at the
    cutoff value are collected from the whole series, not only from the X
    entries the sort happened to select.

    Parameters
    ----------
    true_qtr_val : pandas.Series
        Observed values.
    pred_qtr_val : pandas.Series
        Predicted values.
    X : int, default 10
        Number of top-ranked locations to compare.
    n_bootstrap : int, default 1000
        Number of random tie-breaking draws used for the bootstrapped metrics.
        Must be positive.

    Returns
    -------
    tuple
        ``(best_possible_absolute, best_possible_ratio,
        bootstrapped_absolute, bootstrapped_ratio)``.

        * best_possible_*: tied spots are filled with the tied locations whose
          true and predicted values are closest.
        * bootstrapped_*: mean over ``n_bootstrap`` random fillings of the tied
          spots.

        "absolute" is the gap between the true total of the true top X and the
        true total of the predicted top X; "ratio" is the latter over the former.

    Raises
    ------
    AssertionError
        If either selection does not contain exactly X locations.
    """
    # .iloc makes the positional "first X rows" intent explicit for any index dtype
    top_X_predicted = pred_qtr_val.sort_values(ascending=False).iloc[:X]
    top_X_true = true_qtr_val.sort_values(ascending=False).iloc[:X]

    pred_cutoff = top_X_predicted.min()
    true_cutoff = top_X_true.min()

    # Entries strictly above the cutoff are in the top X however ties are broken
    undisputed_top_predicted = top_X_predicted[top_X_predicted > pred_cutoff]
    num_tied_spots = X - len(undisputed_top_predicted)
    undisputed_top_true = top_X_true[top_X_true > true_cutoff]
    num_true_ties = X - len(undisputed_top_true)

    # Every location sitting exactly on the cutoff competes for the open spots
    tied_top_predicted = pred_qtr_val[pred_qtr_val == pred_cutoff]
    tied_top_true = true_qtr_val[true_qtr_val == true_cutoff]

    error_in_top_true_ties = np.abs(tied_top_true - pred_qtr_val[tied_top_true.index]).sort_values(ascending=True)
    error_in_top_pred_ties = np.abs(true_qtr_val[tied_top_predicted.index] - tied_top_predicted).sort_values(
        ascending=True)
    top_true_tied_geoids = error_in_top_true_ties.iloc[:num_true_ties].index
    top_pred_tied_geoids = error_in_top_pred_ties.iloc[:num_tied_spots].index

    best_possible_top_true_geoids = undisputed_top_true.index.union(top_true_tied_geoids)
    best_possible_top_pred_geoids = undisputed_top_predicted.index.union(top_pred_tied_geoids)

    # True values of GEOIDS with highest actual deaths. If ties, finds tied locations that match preds best
    best_possible_true = true_qtr_val[best_possible_top_true_geoids]
    best_possible_pred = true_qtr_val[best_possible_top_pred_geoids]

    assert (len(best_possible_true) == X)
    assert (len(best_possible_pred) == X)

    best_possible_absolute = np.abs(best_possible_true.sum() - best_possible_pred.sum())
    best_possible_ratio = np.abs(best_possible_pred).sum() / np.abs(best_possible_true).sum()

    true_top_total = top_X_true.sum()
    true_top_abs_total = np.abs(top_X_true).sum()

    absolute_samples = []
    ratio_samples = []
    for _ in range(n_bootstrap):
        # Draw WITHOUT replacement: a repeated label would collapse in the union
        # below and the sample would score fewer than X locations.
        drawn_ties = np.random.choice(tied_top_predicted.index, num_tied_spots, replace=False)
        sampled_geoids = undisputed_top_predicted.index.union(drawn_ties)
        captured = true_qtr_val[sampled_geoids]
        absolute_samples.append(np.abs(true_top_total - captured.sum()))
        ratio_samples.append(np.abs(captured).sum() / true_top_abs_total)

    bootstrapped_absolute = np.mean(absolute_samples)
    bootstrapped_ratio = np.mean(ratio_samples)

    return best_possible_absolute, best_possible_ratio, bootstrapped_absolute, bootstrapped_ratio

2023-02-08 16:18:46.869805: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-08 16:18:46.984407: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-08 16:18:46.989612: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-08 16:18:46.989626: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:
# Locations of the input data on the cluster file system
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'
result_dir = os.path.join(data_dir, 'results_20220606_update')
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')

# Monthly records with SVI covariates, read as a GeoDataFrame
svi_file = os.path.join(result_dir, 'svi_month')
svi_gdf = gpd.read_file(svi_file)
# Call it "grid_squar" because shapefile field names are limited to 10 characters
svi_gdf = svi_gdf.rename(columns={'INTPTLAT': 'lat', 'INTPTLON': 'lon', 'GEOID': 'grid_squar'})
# Make lat and lon floats. Assign the columns directly: with pandas >= 2.0,
# `df.loc[:, col] = ...` writes into the existing column in place and would
# keep its original (object) dtype instead of switching to float64.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
# deaths_gdf starts out as another name for the same frame
deaths_gdf = svi_gdf


# One row per tract (taken from the January 2000 records), used when only the
# unique tracts and their locations are needed
is_first_month = (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1)
just_grid = deaths_gdf.loc[is_first_month, ['grid_squar', 'geometry', 'lat', 'lon']]

# For every tract, collect the ids of all tracts whose centre lies closer than
# 8 km (haversine distance). A tract is always its own neighbour (distance 0).
neighbors = {}
for _, row in just_grid.iterrows():
    origin_lon, origin_lat = row['lon'], row['lat']
    just_grid.loc[:, 'haversine'] = just_grid.apply(
        lambda other: haversine(origin_lon, origin_lat, other['lon'], other['lat']),
        axis=1)
    within_radius = just_grid['haversine'] < 8
    matching_neighbors = just_grid.loc[within_radius, 'grid_squar'].values
    neighbors[row['grid_squar']] = matching_neighbors

# Unique tract ids and the range of years present in the data
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
# Index by (tract, year, month) so one calendar month can be addressed across all tracts
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'month']).sort_index()

# Running 0-based counters. They start at min_year, so they equal
# "time since 2000" only if the data begins in 2000.
month_since_2000 = 0
season_since_2000 = 0
qtr_since_2000 = 0
year_since_2000 = 0
for year in range(min_year, max_year + 1):
    for month in range(1, 12 + 1):

        # Half-year label
        if month in [1, 2, 3, 4, 5, 6]:
            season = 'jan-jun'
        else:
            season = 'jul-dec'

        # Calendar quarter (1-4)
        if month <= 3:
            qtr = 1
        elif month <= 6:
            qtr = 2
        elif month <= 9:
            qtr = 3
        else:
            qtr = 4

        # Stamp every tract's row for this (year, month) with the current labels/counters
        deaths_gdf.loc[idx[:, year, month], 'month_since_2000'] = month_since_2000
        deaths_gdf.loc[idx[:, year, month], 'season'] = season
        deaths_gdf.loc[idx[:, year, month], 'season_since_2000'] = season_since_2000
        deaths_gdf.loc[idx[:, year, month], 'quarter'] = qtr
        deaths_gdf.loc[idx[:, year, month], 'qtr_since_2000'] = qtr_since_2000
        deaths_gdf.loc[idx[:, year, month], 'year_since_2000'] = year_since_2000

        month_since_2000 += 1

        # Advance the coarser counters after the last month of each period
        if month in [6, 12]:
            season_since_2000 += 1

        if month in [3, 6, 9, 12]:
            qtr_since_2000 += 1

        if month == 12:
            year_since_2000 += 1

# Back to a flat frame; grid_squar, year and month become ordinary columns again
deaths_gdf = deaths_gdf.reset_index()


# Aggregate the monthly records to one row per (tract, year) and build the lagged features.
timestep_col = 'year_since_2000'
 
# 'timestep' is a copy of the yearly counter; the counter itself becomes an index level below
deaths_gdf.loc[:, 'timestep'] = deaths_gdf.loc[:, timestep_col]
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', timestep_col]).sort_index()
# 'self_t-1' is the 'deaths' column shifted down by one row of the sorted frame.
# NOTE(review): the shift is applied before aggregation and is not grouped by tract,
# so it appears to lag by one input row (and to carry values across tract
# boundaries) rather than by one aggregated timestep -- confirm this is intended.
deaths_gdf.loc[idx[:, :, :], 'self_t-1'] = deaths_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every (tract, year, timestep) key; supplies the non-summed columns
unduped_gdf = deaths_gdf[~deaths_gdf.index.duplicated(keep='first')]
# Total deaths per (tract, year, timestep)
summed_deaths = deaths_gdf.groupby(level=[0,1,2]).sum()[['deaths']]
# Re-attach the remaining columns; the un-summed 'deaths' copy gets the '_garbage' suffix and is dropped
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
deaths_gdf = summed_deaths
# 'neighbor_t-1': per-timestep mean of 'self_t-1' over the tract's neighbours, shifted by one more step.
# NOTE(review): 'self_t-1' is already lagged, so the extra shift(1) looks like a
# second lag -- confirm. The .values assignment is positional and assumes every
# tract has a row for every (year, timestep) that occurs among its neighbours.
for tract in tracts:
    deaths_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        deaths_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(level=['year', timestep_col]).mean().shift(1,
                                                                                                                fill_value=0).values

# NOTE(review): this value is not read again in this cell
timestep = 0


# Flat frame from which the train/test slices are taken
deaths_gdf_with_autoregressive = deaths_gdf.reset_index()

# Feature-name lists. NOTE(review): 'neighbors_last_timestep' and 'last_timestep'
# are not columns created by the code shown here, and these two lists are not
# used by the slices below -- confirm whether they are still needed.
features = ['grid_squar','year','quarter', 'lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
         'svi_pctile', 'neighbors_last_timestep', 'last_timestep']
features_no_idx = ['lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
 'svi_pctile', 'neighbors_last_timestep', 'last_timestep']

# Columns kept for model inputs (x) and for targets (y)
x_columns = ['grid_squar', 'lat', 'lon', 'timestep',
             'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile',
             'neighbor_t-1', 'self_t-1']
y_columns = ['grid_squar', 'timestep', 'deaths']

# Row selectors
row_year = deaths_gdf_with_autoregressive['year']
through_2018 = row_year <= 2018
through_2019 = row_year <= 2019
only_2019 = row_year == 2019
only_2020 = row_year == 2020
only_2019_q1 = (row_year == 2019) & (deaths_gdf_with_autoregressive['quarter'] == 1)

# Training sets: everything up to and including the given year
train_x_through_2018 = deaths_gdf_with_autoregressive.loc[through_2018, x_columns]
train_y_through_2018 = deaths_gdf_with_autoregressive.loc[through_2018, y_columns]
train_x_through_2019 = deaths_gdf_with_autoregressive.loc[through_2019, x_columns]
train_y_through_2019 = deaths_gdf_with_autoregressive.loc[through_2019, y_columns]

# Single-year evaluation sets
x_just_2019 = deaths_gdf_with_autoregressive.loc[only_2019, x_columns]
y_just_2019 = deaths_gdf_with_autoregressive.loc[only_2019, y_columns]
x_just_2020 = deaths_gdf_with_autoregressive.loc[only_2020, x_columns]
y_just_2020 = deaths_gdf_with_autoregressive.loc[only_2020, y_columns]

# Rows of 2019 whose 'quarter' column equals 1
x_just_2019q1 = deaths_gdf_with_autoregressive.loc[only_2019_q1, x_columns]
y_just_2019q1 = deaths_gdf_with_autoregressive.loc[only_2019_q1, y_columns]



In [27]:
# Re-read the monthly records so the quarterly pipeline starts from the raw frame
svi_file = os.path.join(result_dir, 'svi_month')
svi_gdf = gpd.read_file(svi_file)
# Call it "grid_squar" because shapefile field names are limited to 10 characters
svi_gdf = svi_gdf.rename(columns={'INTPTLAT': 'lat', 'INTPTLON': 'lon', 'GEOID': 'grid_squar'})
# Make lat and lon floats. Assign the columns directly: with pandas >= 2.0,
# `df.loc[:, col] = ...` writes into the existing column in place and would
# keep its original (object) dtype instead of switching to float64.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
# deaths_gdf starts out as another name for the same frame
deaths_gdf = svi_gdf

# One row per tract (taken from the January 2000 records), used when only the
# unique tracts and their locations are needed
is_first_month = (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1)
just_grid = deaths_gdf.loc[is_first_month, ['grid_squar', 'geometry', 'lat', 'lon']]

# For every tract, collect the ids of all tracts whose centre lies closer than
# 8 km (haversine distance). A tract is always its own neighbour (distance 0).
neighbors = {}
for _, row in just_grid.iterrows():
    origin_lon, origin_lat = row['lon'], row['lat']
    just_grid.loc[:, 'haversine'] = just_grid.apply(
        lambda other: haversine(origin_lon, origin_lat, other['lon'], other['lat']),
        axis=1)
    within_radius = just_grid['haversine'] < 8
    matching_neighbors = just_grid.loc[within_radius, 'grid_squar'].values
    neighbors[row['grid_squar']] = matching_neighbors

# Unique tract ids and the range of years present in the data
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
# Index by (tract, year, month) so one calendar month can be addressed across all tracts
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'month']).sort_index()

# Running 0-based counters. They start at min_year, so they equal
# "time since 2000" only if the data begins in 2000.
month_since_2000 = 0
season_since_2000 = 0
qtr_since_2000 = 0
year_since_2000 = 0
for year in range(min_year, max_year + 1):
    for month in range(1, 12 + 1):

        # Half-year label
        if month in [1, 2, 3, 4, 5, 6]:
            season = 'jan-jun'
        else:
            season = 'jul-dec'

        # Calendar quarter (1-4)
        if month <= 3:
            qtr = 1
        elif month <= 6:
            qtr = 2
        elif month <= 9:
            qtr = 3
        else:
            qtr = 4

        # Stamp every tract's row for this (year, month) with the current labels/counters
        deaths_gdf.loc[idx[:, year, month], 'month_since_2000'] = month_since_2000
        deaths_gdf.loc[idx[:, year, month], 'season'] = season
        deaths_gdf.loc[idx[:, year, month], 'season_since_2000'] = season_since_2000
        deaths_gdf.loc[idx[:, year, month], 'quarter'] = qtr
        deaths_gdf.loc[idx[:, year, month], 'qtr_since_2000'] = qtr_since_2000
        deaths_gdf.loc[idx[:, year, month], 'year_since_2000'] = year_since_2000

        month_since_2000 += 1

        # Advance the coarser counters after the last month of each period
        if month in [6, 12]:
            season_since_2000 += 1

        if month in [3, 6, 9, 12]:
            qtr_since_2000 += 1

        if month == 12:
            year_since_2000 += 1

# Back to a flat frame; grid_squar, year and month become ordinary columns again
deaths_gdf = deaths_gdf.reset_index()
# Aggregate the monthly records to one row per (tract, year, quarter) and build the lagged features.
timestep_col = 'quarter'
 
deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'quarter']).sort_index()
# 'self_t-1' is the 'deaths' column shifted down by one row of the sorted frame.
# NOTE(review): the shift is applied before aggregation and is not grouped by tract,
# so it appears to lag by one input row (and to carry values across tract
# boundaries) rather than by one quarter -- confirm this is intended.
deaths_gdf.loc[idx[:, :, :], 'self_t-1'] = deaths_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every (tract, year, quarter) key; supplies the non-summed columns
unduped_gdf = deaths_gdf[~deaths_gdf.index.duplicated(keep='first')]
# Total deaths per (tract, year, quarter)
summed_deaths = deaths_gdf.groupby(level=[0,1,2]).sum()[['deaths']]
# Re-attach the remaining columns; the un-summed 'deaths' copy gets the '_garbage' suffix and is dropped
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
deaths_gdf = summed_deaths
# 'neighbor_t-1': per-quarter mean of 'self_t-1' over the tract's neighbours, shifted by one more step.
# NOTE(review): 'self_t-1' is already lagged, so the extra shift(1) looks like a
# second lag -- confirm. The .values assignment is positional and assumes every
# tract has a row for every (year, quarter) that occurs among its neighbours.
for tract in tracts:
    deaths_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        deaths_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(level=['year', 'quarter']).mean().shift(1,
                                                                                                                fill_value=0).values

# 0-based running index over (year, quarter), written to the 'timestep' column for all tracts
timestep = 0

for year in range(min_year, max_year + 1):
    for quarter in range(1, 5):
        deaths_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
        timestep += 1

# Flat frame from which the train/test slices are taken
deaths_gdf_with_autoregressive = deaths_gdf.reset_index()

# Feature-name lists. With timestep_col == 'quarter' the name 'quarter' appears
# twice in `features`. NOTE(review): 'neighbors_last_timestep' and 'last_timestep'
# are not columns created by the code shown here, and these two lists are not
# used by the slices below -- confirm whether they are still needed.
features = ['grid_squar','year','quarter', 'lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
         'svi_pctile', 'neighbors_last_timestep', 'last_timestep']
features_no_idx = ['lat', 'lon', timestep_col, 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
 'svi_pctile', 'neighbors_last_timestep', 'last_timestep']

# Columns kept for model inputs (x) and for targets (y)
x_columns = ['grid_squar', 'lat', 'lon', 'timestep',
             'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile',
             'neighbor_t-1', 'self_t-1']
y_columns = ['grid_squar', 'timestep', 'deaths']

# Row selectors
row_year = deaths_gdf_with_autoregressive['year']
row_quarter = deaths_gdf_with_autoregressive['quarter']
through_2018 = row_year <= 2018
through_2019 = row_year <= 2019
only_2019 = row_year == 2019
only_2020 = row_year == 2020

# Training sets: everything up to and including the given year
train_x_through_2018 = deaths_gdf_with_autoregressive.loc[through_2018, x_columns]
train_y_through_2018 = deaths_gdf_with_autoregressive.loc[through_2018, y_columns]
train_x_through_2019 = deaths_gdf_with_autoregressive.loc[through_2019, x_columns]
train_y_through_2019 = deaths_gdf_with_autoregressive.loc[through_2019, y_columns]

# Single-year evaluation sets
x_just_2019 = deaths_gdf_with_autoregressive.loc[only_2019, x_columns]
y_just_2019 = deaths_gdf_with_autoregressive.loc[only_2019, y_columns]
x_just_2020 = deaths_gdf_with_autoregressive.loc[only_2020, x_columns]
y_just_2020 = deaths_gdf_with_autoregressive.loc[only_2020, y_columns]

# Kernels restricted to the first two input columns and to the third input column.
# NOTE(review): the meaning of the first positional argument depends on the
# installed gpflow version -- confirm.
spatial_kernel = gpflow.kernels.RBF(2, active_dims=[0, 1])
temporal_kernel = gpflow.kernels.RBF(1, active_dims=[2])

# Per-quarter evaluation sets for 2019 (quarters 2 to 4; quarter 1 is not defined in this cell)
only_2019_q2 = only_2019 & (row_quarter == 2)
x_just_2019q2 = deaths_gdf_with_autoregressive.loc[only_2019_q2, x_columns]
y_just_2019q2 = deaths_gdf_with_autoregressive.loc[only_2019_q2, y_columns]

only_2019_q3 = only_2019 & (row_quarter == 3)
x_just_2019q3 = deaths_gdf_with_autoregressive.loc[only_2019_q3, x_columns]
y_just_2019q3 = deaths_gdf_with_autoregressive.loc[only_2019_q3, y_columns]

only_2019_q4 = only_2019 & (row_quarter == 4)
x_just_2019q4 = deaths_gdf_with_autoregressive.loc[only_2019_q4, x_columns]
y_just_2019q4 = deaths_gdf_with_autoregressive.loc[only_2019_q4, y_columns]

In [None]:
import contextily as cx

from shapely.geometry import Point, Polygon

# Shapefiles for Massachusetts census tracts and towns
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')
town_shapefile = os.path.join(data_dir,'shapefiles','MA_2020_Towns',
                              'CENSUS2020TOWNS_POLY.shp')

tract_gdf = gpd.read_file(mass_shapefile)
# Direct column assignment so the dtype really becomes int
# (`.loc[:, col] = ...` writes in place and keeps the old dtype on pandas >= 2.0)
tract_gdf['TRACTCE'] = tract_gdf['TRACTCE'].astype(int)

town_shapes = gpd.read_file(town_shapefile)
# Reproject the towns to EPSG:4269; the {'init': 'epsg:...'} dict form is deprecated
towns_lat_lon = town_shapes.to_crs(epsg=4269)

# Re-read the raw monthly file. This rebinds svi_gdf to a frame that still has
# the original GEOID / INTPTLAT / INTPTLON column names.
svi_file = os.path.join(result_dir,'svi_month')
svi_gdf = gpd.read_file(svi_file)
# One row per tract, taken from the January 2000 records; .copy() so that adding
# a column below does not write into a slice of svi_gdf
just_tracts = svi_gdf.loc[(svi_gdf['year']==2000)&(svi_gdf['month']==1),['GEOID','geometry', 'INTPTLAT', 'INTPTLON']].copy()
# Built-in float replaces np.float, which was removed in NumPy 1.24
just_tracts['points'] = just_tracts.apply(lambda x: Point(float(x['INTPTLON']), float(x['INTPTLAT'])), axis=1)
# Flatten once instead of three times.
# NOTE(review): relies on deaths_gdf, as left by an earlier cell, still exposing
# 'year' and 'month' after reset_index() -- confirm for the cell order you run.
deaths_flat = deaths_gdf.reset_index()
just_towns = deaths_flat.loc[(deaths_flat['year']==2000)&(deaths_flat['month']==1)]

In [3]:
import pickle
# Load the previously trained model from disk.
# NOTE: pickle.load executes code stored in the file; only load model files you trust.
model_path = '/cluster/home/kheuto01/try_thing_yr/model.mod'
with open(model_path, 'rb') as f:
    m_year = pickle.load(f)

2023-02-08 16:22:27.897440: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-02-08 16:22:27.897484: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: p1cmp078.pax.tufts.edu
2023-02-08 16:22:27.897492: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: p1cmp078.pax.tufts.edu
2023-02-08 16:22:27.897623: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.47.3
2023-02-08 16:22:27.897656: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.47.3
2023-02-08 16:22:27.897662: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 510.47.3
2023-02-08 16:22:27.898081: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the fo

In [16]:
# Model input columns, in the order passed to the model
feature_cols = ['lat','lon','timestep','theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                'svi_pctile', 'neighbor_t-1', 'self_t-1']
# Only the first element of the model output is kept
# (presumably the predictive mean -- confirm against onoffgpf)
sg_2019 = m_year.predict_onoffgp(x_just_2019q1.loc[:, feature_cols].values)[0]
# Label each prediction with its tract id
pred_2019_df = pd.Series(sg_2019.numpy().squeeze(), index=x_just_2019q1.grid_squar)
# Top-X capture metrics against the observed deaths of y_just_2019q1
fixed_top_X(y_just_2019q1.set_index('grid_squar')['deaths'],pred_2019_df)

(40.0, 0.40298507462686567, 40.0, 0.4029850746268656)

In [28]:
# Model input columns, in the order passed to the model
feature_cols = ['lat','lon','timestep','theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                'svi_pctile', 'neighbor_t-1', 'self_t-1']
# NOTE(review): the features come from x_just_2019q1 although the targets below
# are y_just_2019q2 -- confirm that scoring these predictions against Q2 is intended.
sg_2019 = m_year.predict_onoffgp(x_just_2019q1.loc[:, feature_cols].values)
# Only the first element of the model output is kept
# (presumably the predictive mean -- confirm against onoffgpf)
sg_2019 = sg_2019[0]
# Label predictions with the tract ids of the frame that was actually predicted on,
# so labels cannot drift if another frame has different rows or row order.
pred_2019_df = pd.Series(sg_2019.numpy().squeeze(), index=x_just_2019q1.grid_squar)
fixed_top_X(y_just_2019q2.set_index('grid_squar')['deaths'],pred_2019_df)

(22.0, 0.2413793103448276, 22.0, 0.2413793103448275)

In [29]:
# Model input columns, in the order passed to the model
feature_cols = ['lat','lon','timestep','theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                'svi_pctile', 'neighbor_t-1', 'self_t-1']
# NOTE(review): the features come from x_just_2019q1 although the targets below
# are y_just_2019q3 -- confirm that scoring these predictions against Q3 is intended.
sg_2019 = m_year.predict_onoffgp(x_just_2019q1.loc[:, feature_cols].values)
# Only the first element of the model output is kept
# (presumably the predictive mean -- confirm against onoffgpf)
sg_2019 = sg_2019[0]
# Label predictions with the tract ids of the frame that was actually predicted on,
# so labels cannot drift if another frame has different rows or row order.
pred_2019_df = pd.Series(sg_2019.numpy().squeeze(), index=x_just_2019q1.grid_squar)
fixed_top_X(y_just_2019q3.set_index('grid_squar')['deaths'],pred_2019_df)

(24.0, 0.2727272727272727, 24.0, 0.2727272727272726)

In [30]:
# Model input columns, in the order passed to the model
feature_cols = ['lat','lon','timestep','theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
                'svi_pctile', 'neighbor_t-1', 'self_t-1']
# NOTE(review): the features come from x_just_2019q1 although the targets below
# are y_just_2019q4 -- confirm that scoring these predictions against Q4 is intended.
sg_2019 = m_year.predict_onoffgp(x_just_2019q1.loc[:, feature_cols].values)
# Only the first element of the model output is kept
# (presumably the predictive mean -- confirm against onoffgpf)
sg_2019 = sg_2019[0]
# Label predictions with the tract ids of the frame that was actually predicted on,
# so labels cannot drift if another frame has different rows or row order.
pred_2019_df = pd.Series(sg_2019.numpy().squeeze(), index=x_just_2019q1.grid_squar)
fixed_top_X(y_just_2019q4.set_index('grid_squar')['deaths'],pred_2019_df)

(24.0, 0.2, 24.0, 0.20000000000000004)

In [31]:
# Average of the four bootstrapped_ratio values (last tuple element) printed by
# the fixed_top_X cells; the numbers are pasted in by hand.
bootstrapped_ratios_2019 = [
    0.4029850746268656,
    0.2413793103448275,
    0.2727272727272726,
    0.20000000000000004,
]
np.mean(bootstrapped_ratios_2019)

0.27927291442474145

In [79]:
# NOTE(review): final_preds is not assigned in any cell shown here; this cell
# relies on state left in the kernel by code that is no longer visible.
final_preds

Unnamed: 0,grid_squar,deaths
0,25001010100,0.019381
1,25001010206,0.031325
2,25001010208,0.017843
3,25001010304,0.069675
4,25001010306,0.069675
...,...,...
1615,25027761100,0.313183
1616,25027761200,0.072541
1617,25027761300,0.340030
1618,25027761401,0.148008


In [7]:
# Display the current prediction Series (indexed by grid_squar). Which cell
# assigned it last depends on the order in which the cells were run.
pred_2019_df

grid_squar
25001010100    0.224602
25001010206    0.218860
25001010208    0.189577
25001010304    0.283480
25001010306    0.257300
                 ...   
25027761100    0.370158
25027761200    0.481235
25027761300    0.485591
25027761401    0.485148
25027761402    0.497723
Length: 1620, dtype: float64

In [8]:
# Display the targets (grid_squar, timestep, deaths) selected as y_just_2019q1
y_just_2019q1

Unnamed: 0,grid_squar,timestep,deaths
19,25001010100,19.0,1.0
41,25001010206,19.0,1.0
63,25001010208,19.0,0.0
85,25001010304,19.0,1.0
107,25001010306,19.0,0.0
...,...,...,...
35549,25027761100,19.0,0.0
35571,25027761200,19.0,1.0
35593,25027761300,19.0,1.0
35615,25027761401,19.0,0.0


(40.0, 0.40298507462686567, 40.0, 0.4029850746268656)

In [11]:
# Display the current prediction Series again (same variable as shown earlier;
# its contents depend on which evaluation cell ran last).
pred_2019_df

grid_squar
25001010100    0.224602
25001010206    0.218860
25001010208    0.189577
25001010304    0.283480
25001010306    0.257300
                 ...   
25027761100    0.370158
25027761200    0.481235
25027761300    0.485591
25027761401    0.485148
25027761402    0.497723
Length: 1620, dtype: float64

In [26]:
# Display the targets (grid_squar, timestep, deaths) selected as y_just_2019q2
y_just_2019q2

Unnamed: 0,grid_squar,timestep,deaths
