In [1]:
"""Run a zero-inflated GP on opioid data"""
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import copy

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import gpflow
import tensorflow as tf
import sys
code_dir = '/cluster/home/kheuto01/code/zero-inflated-gp/'
sys.path.append(code_dir)
from math import radians, cos, sin, asin, sqrt
from onoffgpf import OnOffSVGP, OnOffLikelihood

import pickle

from math import radians, cos, sin, asin, sqrt

2023-03-04 15:05:19.590613: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 15:05:19.750002: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-04 15:05:19.755888: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-04 15:05:19.755903: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance, in kilometers, between two points on the earth.

    All four arguments are decimal degrees; note the (lon, lat) ordering of
    each point. Implementation follows
    https://stackoverflow.com/a/4913653/1748679
    """
    # Mean earth radius in kilometers; 3956 would give miles instead.
    earth_radius_km = 6371

    # Work in radians from here on.
    lon1_r, lat1_r, lon2_r, lat2_r = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    # Haversine of the central angle between the two points.
    delta_lon = lon2_r - lon1_r
    delta_lat = lat2_r - lat1_r
    hav = sin(delta_lat/2)**2 + cos(lat1_r) * cos(lat2_r) * sin(delta_lon/2)**2

    # Invert the haversine to get the central angle, then scale by the radius.
    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius_km

In [3]:
# Root directory of the dataset; every other path in this notebook hangs off it.
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'

# Shapefile location under the dataset root.
mass_shapefile = os.path.join(data_dir, 'shapefiles', 'MA_2021')

# Directory that holds the input table and receives the cleaned outputs.
result_dir = os.path.join(data_dir, 'results_20220606_update')

In [4]:
# Load the table stored under result_dir/svi_month_zcta.
svi_file = os.path.join(result_dir, 'svi_month_zcta')
svi_gdf = gpd.read_file(svi_file)

# Use short column names; the original note here cited a 10-character
# column-name limit when writing with geopandas.
svi_gdf = svi_gdf.rename(columns={'INTPTLAT20': 'lat', 'INTPTLON20': 'lon', 'GEOID20': 'geoid'})

# Cast lat and lon to float. Plain column assignment replaces the column, so
# the new dtype is guaranteed to stick; `df.loc[:, col] = ...` writes into the
# existing column in place and, on recent pandas, can leave the original dtype
# unchanged.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)

# Downstream cells refer to this same frame as deaths_gdf.
deaths_gdf = svi_gdf


In [6]:
    # Two tracts count as neighbors when their lat/lon points are closer than
    # this great-circle distance, in kilometers.
    neighbor_radius_km = 8

    # Used when we just need the unique tracts and their locations: one row
    # per tract, taken from the January-2000 rows.
    # .copy() makes this an independent frame, so adding the scratch
    # 'haversine' column below does not write into a slice of deaths_gdf
    # (which triggers pandas' SettingWithCopyWarning).
    just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1),
        ['geoid', 'geometry', 'lat', 'lon']].copy()

    # Map each tract's geoid to the geoids of all tracts within the radius.
    # A tract is at distance 0 from itself, so it appears in its own list.
    neighbors = {}
    for _, row in just_grid.iterrows():
        # Distance from the current tract to every tract (scratch column,
        # overwritten on each iteration).
        just_grid['haversine'] = just_grid.apply(
            lambda x: haversine(row['lon'], row['lat'], x['lon'], x['lat']),
            axis=1)
        matching_neighbors = just_grid.loc[
            just_grid['haversine'] < neighbor_radius_km, 'geoid'].values
        neighbors[row['geoid']] = matching_neighbors

    tracts = deaths_gdf['geoid'].unique()
    min_year = deaths_gdf.year.min()
    max_year = deaths_gdf.year.max()

    # Index by (tract, year, month) so one whole calendar month can be
    # addressed across all tracts with idx[:, year, month].
    deaths_gdf = deaths_gdf.set_index(['geoid', 'year', 'month']).sort_index()

    # Running counters, all starting at 0 for the first month of min_year.
    # NOTE(review): the "_since_2000" names presume min_year is 2000 — confirm.
    month_since_2000 = 0
    season_since_2000 = 0
    qtr_since_2000 = 0
    year_since_2000 = 0
    for year in range(min_year, max_year + 1):
        for month in range(1, 12 + 1):

            # Half-year label and calendar quarter (1-4) for this month.
            season = 'jan-jun' if month <= 6 else 'jul-dec'
            qtr = (month - 1) // 3 + 1

            deaths_gdf.loc[idx[:, year, month], 'month_since_2000'] = month_since_2000
            deaths_gdf.loc[idx[:, year, month], 'season'] = season
            deaths_gdf.loc[idx[:, year, month], 'season_since_2000'] = season_since_2000
            deaths_gdf.loc[idx[:, year, month], 'quarter'] = qtr
            deaths_gdf.loc[idx[:, year, month], 'qtr_since_2000'] = qtr_since_2000
            deaths_gdf.loc[idx[:, year, month], 'year_since_2000'] = year_since_2000

            month_since_2000 += 1

            # Advance the coarser counters once their period's last month
            # has been labelled.
            if month in [6, 12]:
                season_since_2000 += 1

            if month in [3, 6, 9, 12]:
                qtr_since_2000 += 1

            if month == 12:
                year_since_2000 += 1

    # Back to a flat frame; later cells re-index it at other granularities.
    deaths_gdf = deaths_gdf.reset_index()


In [7]:

# Re-index by (tract, year, quarter); a key can now hold several rows.
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'quarter']).sort_index()

# Previous row's deaths, lagged *within each tract*. Shifting the whole column
# at once would hand the first row of every tract the last value of the tract
# sorted before it; grouping by geoid gives those rows the fill value 0.
# to_numpy() assigns positionally, avoiding alignment on the duplicated index.
cleaned_gdf['self_t-1'] = (
    cleaned_gdf['deaths'].groupby(level='geoid').shift(1, fill_value=0).to_numpy()
)

# First row of every (tract, year, quarter) key supplies the non-summed columns.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]

# Total deaths per key. Only the 'deaths' column is aggregated, so geometry
# and text columns are never passed through sum().
summed_deaths = cleaned_gdf['deaths'].groupby(level=[0, 1, 2]).sum().to_frame()
summed_deaths = summed_deaths.merge(unduped_gdf.drop(columns='deaths'), how='left',
                                    left_index=True, right_index=True)
cleaned_gdf = summed_deaths

# Mean of 'self_t-1' over each tract's neighbors, per (year, quarter).
# NOTE(review): 'self_t-1' is already lagged and the mean is shifted once more
# here — confirm the extra lag is intended.
for tract in tracts:
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(
            level=['year', 'quarter']).mean().shift(1, fill_value=0).values

# Consecutive integer id for every (year, quarter), starting at 0.
timestep = 0

for year in range(min_year, max_year + 1):
    for quarter in range(1, 5):
        cleaned_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [9]:
# Write the quarterly table to result_dir/clean_quarter_zcta.
# cleaned_gdf is wrapped in a GeoDataFrame before writing — presumably because
# the preceding merge may have returned a plain DataFrame; confirm.
# NOTE(review): the path has no extension, so the output format depends on
# geopandas' default driver — confirm it is the intended one.
svi_out_file = os.path.join(result_dir, 'clean_quarter_zcta')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [10]:

# Re-index by (tract, year, half-year season); a key can now hold several rows.
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'season']).sort_index()

# Previous row's deaths, lagged *within each tract*. Shifting the whole column
# at once would hand the first row of every tract the last value of the tract
# sorted before it; grouping by geoid gives those rows the fill value 0.
# to_numpy() assigns positionally, avoiding alignment on the duplicated index.
cleaned_gdf['self_t-1'] = (
    cleaned_gdf['deaths'].groupby(level='geoid').shift(1, fill_value=0).to_numpy()
)

# First row of every (tract, year, season) key supplies the non-summed columns.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]

# Total deaths per key. Only the 'deaths' column is aggregated, so geometry
# and text columns are never passed through sum().
summed_deaths = cleaned_gdf['deaths'].groupby(level=[0, 1, 2]).sum().to_frame()
summed_deaths = summed_deaths.merge(unduped_gdf.drop(columns='deaths'), how='left',
                                    left_index=True, right_index=True)
cleaned_gdf = summed_deaths

# Mean of 'self_t-1' over each tract's neighbors, per (year, season).
# NOTE(review): 'self_t-1' is already lagged and the mean is shifted once more
# here — confirm the extra lag is intended.
for tract in tracts:
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1'].groupby(
            level=['year', 'season']).mean().shift(1, fill_value=0).values

# Consecutive integer id for every (year, season), starting at 0.
timestep = 0

for year in range(min_year, max_year + 1):
    for season in ['jan-jun', 'jul-dec']:
        cleaned_gdf.loc[idx[:, year, season], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [11]:
# Write the semiannual table to result_dir/clean_semi_zcta.
# cleaned_gdf is wrapped in a GeoDataFrame before writing — presumably because
# the preceding merge may have returned a plain DataFrame; confirm.
# NOTE(review): the path has no extension, so the output format depends on
# geopandas' default driver — confirm it is the intended one.
svi_out_file = os.path.join(result_dir, 'clean_semi_zcta')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [12]:

# Re-index by (tract, year); a key can now hold several rows.
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year']).sort_index()

# Previous row's deaths, lagged *within each tract*. Shifting the whole column
# at once would hand the first row of every tract the last value of the tract
# sorted before it; grouping by geoid gives those rows the fill value 0.
# Plain column assignment also replaces the earlier three-slicer
# idx[:, :, :] selection, which did not match this two-level index.
# to_numpy() assigns positionally, avoiding alignment on the duplicated index.
cleaned_gdf['self_t-1'] = (
    cleaned_gdf['deaths'].groupby(level='geoid').shift(1, fill_value=0).to_numpy()
)

# First row of every (tract, year) key supplies the non-summed columns.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]

# Total deaths per key. Only the 'deaths' column is aggregated, so geometry
# and text columns are never passed through sum().
summed_deaths = cleaned_gdf['deaths'].groupby(level=[0, 1]).sum().to_frame()
summed_deaths = summed_deaths.merge(unduped_gdf.drop(columns='deaths'), how='left',
                                    left_index=True, right_index=True)
cleaned_gdf = summed_deaths

# Mean of 'self_t-1' over each tract's neighbors, per year.
# NOTE(review): 'self_t-1' is already lagged and the mean is shifted once more
# here — confirm the extra lag is intended.
for tract in tracts:
    cleaned_gdf.loc[idx[tract, :], 'neighbor_t-1'] = \
        cleaned_gdf.loc[idx[neighbors[tract], :], 'self_t-1'].groupby(
            level=['year']).mean().shift(1, fill_value=0).values

# Consecutive integer id for every year, starting at 0.
timestep = 0

for year in range(min_year, max_year + 1):
    cleaned_gdf.loc[idx[:, year], 'timestep'] = timestep
    timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [13]:
# Write the annual table to result_dir/clean_annual_zcta.
# cleaned_gdf is wrapped in a GeoDataFrame before writing — presumably because
# the preceding merge may have returned a plain DataFrame; confirm.
# NOTE(review): the path has no extension, so the output format depends on
# geopandas' default driver — confirm it is the intended one.
svi_out_file = os.path.join(result_dir, 'clean_annual_zcta')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)
