In [1]:
"""Run a zero-inflated GP on opioid data"""
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import copy

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import gpflow
import tensorflow as tf
import sys
code_dir = '/cluster/home/kheuto01/code/zero-inflated-gp/'
sys.path.append(code_dir)
from math import radians, cos, sin, asin, sqrt
from onoffgpf import OnOffSVGP, OnOffLikelihood

import pickle

from math import radians, cos, sin, asin, sqrt

2023-03-02 15:01:22.560327: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-02 15:01:22.693088: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-02 15:01:22.698612: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-02 15:01:22.698625: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    https://stackoverflow.com/a/4913653/1748679

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Great-circle distance in kilometers (earth radius taken as 6371 km).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # For (near-)antipodal points floating-point rounding can leave sqrt(a)
    # marginally above 1, which makes asin raise ValueError; clamp it.
    c = 2 * asin(min(1.0, sqrt(a)))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [3]:
# Root folder of the NSF_OD dataset on the cluster.
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'
# 'MA_2021' shapefile folder under the dataset root.
mass_shapefile = os.path.join(data_dir, 'shapefiles', 'MA_2021')
# Folder of the 2022-06-06 result files, also under the dataset root.
result_dir = os.path.join(data_dir, 'results_20220606_update')

In [4]:
# Load the monthly town-level file from the results folder.
svi_file = os.path.join(result_dir, 'svi_month_town')
svi_gdf = gpd.read_file(svi_file)
# Call it "grid_squar" because geopandas only supports len 10 columns
svi_gdf = svi_gdf.rename(columns={'INTPTLAT20': 'lat', 'INTPTLON20': 'lon', 'GEOID20': 'grid_squar'})
# Make lat and lon floats. Plain column assignment replaces the column together
# with its dtype; `df.loc[:, col] = ...` writes into the existing column in place
# on pandas >= 2.0 and would leave the original (non-float) dtype behind.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
# deaths_gdf is the working name used by the following cells (same object, not a copy).
deaths_gdf = svi_gdf


In [5]:
    # Used when we just need the unique tracts and their locations.
    # .copy() because a scratch 'haversine' column is written to this frame below;
    # without it the write targets a selection of deaths_gdf (SettingWithCopyWarning).
    just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1),
        ['grid_squar', 'geometry', 'lat', 'lon']].copy()

    # Calculate each square's neighbors: every square whose (lat, lon) is less than
    # 8 km away. The distance of a square to itself is 0, so each square is also
    # listed among its own neighbors.
    neighbors = {}
    for _, row in just_grid.iterrows():
        just_grid.loc[:, 'haversine'] = just_grid.apply(
            lambda x: haversine(row['lon'], row['lat'], x['lon'], x['lat']),
            axis=1)
        matching_neighbors = just_grid[just_grid['haversine'] < 8]['grid_squar'].values
        neighbors[row['grid_squar']] = matching_neighbors

    tracts = deaths_gdf['grid_squar'].unique()
    min_year = deaths_gdf.year.min()
    max_year = deaths_gdf.year.max()
    deaths_gdf = deaths_gdf.set_index(['grid_squar', 'year', 'month']).sort_index()

    # Running period counters. They start at 0 in January of min_year.
    # NOTE(review): the "_since_2000" names only hold if min_year is 2000 -- confirm.
    month_since_2000 = 0
    season_since_2000 = 0
    qtr_since_2000 = 0
    year_since_2000 = 0
    for year in range(min_year, max_year + 1):
        for month in range(1, 12 + 1):
            # Half-year label and calendar quarter (1-4) of this month
            season = 'jan-jun' if month <= 6 else 'jul-dec'
            qtr = (month - 1) // 3 + 1

            # All rows (any square) belonging to this year/month
            rows = idx[:, year, month]
            deaths_gdf.loc[rows, 'month_since_2000'] = month_since_2000
            deaths_gdf.loc[rows, 'season'] = season
            deaths_gdf.loc[rows, 'season_since_2000'] = season_since_2000
            deaths_gdf.loc[rows, 'quarter'] = qtr
            deaths_gdf.loc[rows, 'qtr_since_2000'] = qtr_since_2000
            deaths_gdf.loc[rows, 'year_since_2000'] = year_since_2000

            # Advance each counter once its period has ended
            month_since_2000 += 1
            if month in [6, 12]:
                season_since_2000 += 1
            if month in [3, 6, 9, 12]:
                qtr_since_2000 += 1
            if month == 12:
                year_since_2000 += 1

    deaths_gdf = deaths_gdf.reset_index()


In [8]:
# Rename the 'grid_squar' key column to 'geoid'.
deaths_gdf = deaths_gdf.rename(columns={'grid_squar':'geoid'})

In [13]:

# Collapse deaths_gdf to one row per (geoid, year, quarter).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'quarter']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, quarter), the mean 'self_t-1' over the tract's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'quarter']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first quarter of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for quarter in range(1, 5):
        cleaned_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [16]:
# Write cleaned_gdf to <result_dir>/clean_quarter_town.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_quarter_town')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [17]:

# Collapse deaths_gdf to one row per (geoid, year, season).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'season']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, season), the mean 'self_t-1' over the tract's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'season']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first half of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for season in ['jan-jun', 'jul-dec']:
        cleaned_gdf.loc[idx[:, year, season], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [19]:
# Write cleaned_gdf to <result_dir>/clean_semi_town.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_semi_town')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [21]:

# Collapse deaths_gdf to one row per (geoid, year).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# The row indexer has two slices to match the two index levels (the original mixed a
# three-slice indexer on the left with a two-slice one on the right).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per year, the mean 'self_t-1' over the tract's neighbors, shifted one
# further year back. The result is written positionally (.values), so it relies on
# the neighbors covering exactly the years of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in min_year.
timestep = 0

for year in range(min_year, max_year + 1):
        cleaned_gdf.loc[idx[:, year], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [23]:
# Write cleaned_gdf to <result_dir>/clean_annual_town.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_annual_town')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [7]:
# Load the monthly 'svi_month' file from the results folder.
svi_file = os.path.join(result_dir, 'svi_month')
svi_gdf = gpd.read_file(svi_file)
# Rename the key/coordinate columns to the short names used by the following cells
svi_gdf = svi_gdf.rename(columns={'INTPTLAT': 'lat', 'INTPTLON': 'lon', 'GEOID': 'geoid'})
# Make lat and lon floats. Plain column assignment replaces the column together
# with its dtype; `df.loc[:, col] = ...` writes into the existing column in place
# on pandas >= 2.0 and would leave the original (non-float) dtype behind.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
# deaths_gdf is the working name used by the following cells (same object, not a copy).
deaths_gdf = svi_gdf


In [None]:
    # Used when we just need the unique tracts and their locations.
    # .copy() because a scratch 'haversine' column is written to this frame below;
    # without it the write targets a selection of deaths_gdf (SettingWithCopyWarning).
    just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1),
        ['geoid', 'geometry', 'lat', 'lon']].copy()

    # Calculate each tract's neighbors: every tract whose (lat, lon) is less than
    # 8 km away. The distance of a tract to itself is 0, so each tract is also
    # listed among its own neighbors.
    neighbors = {}
    for _, row in just_grid.iterrows():
        just_grid.loc[:, 'haversine'] = just_grid.apply(
            lambda x: haversine(row['lon'], row['lat'], x['lon'], x['lat']),
            axis=1)
        matching_neighbors = just_grid[just_grid['haversine'] < 8]['geoid'].values
        neighbors[row['geoid']] = matching_neighbors

    tracts = deaths_gdf['geoid'].unique()
    min_year = deaths_gdf.year.min()
    max_year = deaths_gdf.year.max()
    deaths_gdf = deaths_gdf.set_index(['geoid', 'year', 'month']).sort_index()

    # Running period counters. They start at 0 in January of min_year.
    # NOTE(review): the "_since_2000" names only hold if min_year is 2000 -- confirm.
    month_since_2000 = 0
    season_since_2000 = 0
    qtr_since_2000 = 0
    year_since_2000 = 0
    for year in range(min_year, max_year + 1):
        for month in range(1, 12 + 1):
            # Half-year label and calendar quarter (1-4) of this month
            season = 'jan-jun' if month <= 6 else 'jul-dec'
            qtr = (month - 1) // 3 + 1

            # All rows (any tract) belonging to this year/month
            rows = idx[:, year, month]
            deaths_gdf.loc[rows, 'month_since_2000'] = month_since_2000
            deaths_gdf.loc[rows, 'season'] = season
            deaths_gdf.loc[rows, 'season_since_2000'] = season_since_2000
            deaths_gdf.loc[rows, 'quarter'] = qtr
            deaths_gdf.loc[rows, 'qtr_since_2000'] = qtr_since_2000
            deaths_gdf.loc[rows, 'year_since_2000'] = year_since_2000

            # Advance each counter once its period has ended
            month_since_2000 += 1
            if month in [6, 12]:
                season_since_2000 += 1
            if month in [3, 6, 9, 12]:
                qtr_since_2000 += 1
            if month == 12:
                year_since_2000 += 1

    deaths_gdf = deaths_gdf.reset_index()


In [29]:

# Collapse deaths_gdf to one row per (geoid, year, quarter).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'quarter']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, quarter), the mean 'self_t-1' over the tract's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'quarter']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first quarter of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for quarter in range(1, 5):
        cleaned_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [31]:
# Write cleaned_gdf to <result_dir>/clean_quarter_tract.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_quarter_tract')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [32]:

# Collapse deaths_gdf to one row per (geoid, year, season).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'season']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, season), the mean 'self_t-1' over the tract's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'season']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first half of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for season in ['jan-jun', 'jul-dec']:
        cleaned_gdf.loc[idx[:, year, season], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [33]:
# Write cleaned_gdf to <result_dir>/clean_semi_tract.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_semi_tract')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [34]:

# Collapse deaths_gdf to one row per (geoid, year).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# The row indexer has two slices to match the two index levels (the original mixed a
# three-slice indexer on the left with a two-slice one on the right).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per year, the mean 'self_t-1' over the tract's neighbors, shifted one
# further year back. The result is written positionally (.values), so it relies on
# the neighbors covering exactly the years of the tract's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in min_year.
timestep = 0

for year in range(min_year, max_year + 1):
        cleaned_gdf.loc[idx[:, year], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

In [None]:
# Write cleaned_gdf to <result_dir>/clean_annual_tract.
# cleaned_gdf is wrapped in a GeoDataFrame first so that to_file is available.
svi_out_file = os.path.join(result_dir, 'clean_annual_tract')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [13]:
# Display the location lookup frame (columns geoid, geometry, lat, lon).
just_grid

Unnamed: 0,geoid,geometry,lat,lon
0,25017333300,"POLYGON ((-71.16139 42.47071, -71.16136 42.471...",42.470764,-71.152055
1,25017333400,"POLYGON ((-71.14690 42.48077, -71.14673 42.480...",42.484548,-71.132032
2,25017333502,"POLYGON ((-71.15960 42.48482, -71.15954 42.484...",42.485040,-71.146119
3,25017354500,"POLYGON ((-71.13077 42.38261, -71.13070 42.382...",42.384745,-71.125040
4,25017354700,"POLYGON ((-71.12901 42.39007, -71.12768 42.389...",42.389581,-71.122629
...,...,...,...,...
1615,25009208302,"POLYGON ((-71.01723 42.48901, -71.01705 42.489...",42.485038,-71.004081
1616,25009212101,"POLYGON ((-71.05565 42.62288, -71.05509 42.625...",42.613694,-71.005231
1617,25009208401,"POLYGON ((-71.04695 42.48796, -71.04672 42.488...",42.487612,-71.030763
1618,25009208402,"POLYGON ((-71.05389 42.47878, -71.05373 42.479...",42.466251,-71.035971


In [5]:
# Load the monthly 'svi_month' file from the results folder.
svi_file = os.path.join(result_dir, 'svi_month')
svi_gdf = gpd.read_file(svi_file)
# Rename the key/coordinate columns to the short names used by the following cells
svi_gdf = svi_gdf.rename(columns={'INTPTLAT': 'lat', 'INTPTLON': 'lon', 'GEOID': 'geoid'})
# Make lat and lon floats. Plain column assignment replaces the column together
# with its dtype; `df.loc[:, col] = ...` writes into the existing column in place
# on pandas >= 2.0 and would leave the original (non-float) dtype behind.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
deaths_gdf = svi_gdf


# Greedily bundle tracts into groups of at most five: walk over the tracts in frame
# order and, for each one, take the five not-yet-grouped tracts closest to it.
# five_aggregate maps the seed row's index label -> list of member geoids.
# NOTE(review): a seed tract that was already placed in an earlier group still seeds
# a new group of its five nearest remaining tracts -- confirm this is intended.
five_aggregate = {}
# .copy(): the 'grouping' column is written to this frame inside the loop
just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1), ['geoid', 'geometry', 'lat', 'lon']].copy()

# Tracts that have not been assigned to a group yet
filtering_list = just_grid.copy()
for r, row in just_grid.iterrows():
    # Distance (km) from the seed tract to every remaining tract
    filtering_list['haversine'] = filtering_list.apply(
        lambda x: haversine(row['lon'], row['lat'], x['lon'], x['lat']),
        axis=1)
    sorted_dist = filtering_list.sort_values('haversine')
    closest_five = sorted_dist.iloc[:5].geoid.unique()
    five_aggregate[r] = closest_five.tolist()
    # Remove the grouped tracts; .copy() so the next 'haversine' write is not made
    # on a filtered selection
    filtering_list = filtering_list[~filtering_list['geoid'].isin(closest_five)].copy()
    # The group id is the index label of the seed row
    just_grid.loc[just_grid['geoid'].isin(closest_five), 'grouping'] = r

    if len(filtering_list) == 0:
        break

# From here on 'geoid' is the group id; the original id is kept as 'tractgeoid'
deaths_gdf = deaths_gdf.merge(just_grid[['geoid','grouping']], on='geoid')
deaths_gdf = deaths_gdf.rename(columns={'geoid':'tractgeoid', 'grouping':'geoid'})


# One geometry per group (member geometries dissolved) with the centroid as lat/lon
just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1), ['geoid', 'geometry', 'lat', 'lon']]
just_grid = just_grid.dissolve('geoid')[['geometry']]
# Compute the centroids once and reuse them for both coordinates
group_centroids = just_grid.centroid
just_grid['lat'] = group_centroids.apply(lambda x: x.y)
just_grid['lon'] = group_centroids.apply(lambda x: x.x)
just_grid = just_grid.reset_index()

svi_cols = ['theme_1_pc','theme_2_pc','theme_3_pc','theme_4_pc','svi_pctile']

# Per (group, year, month): total deaths and mean of the SVI columns. The columns are
# selected *before* aggregating so that strings/geometry are never summed or averaged
# (aggregating every column first raises on pandas >= 2.0).
grouped_gdf = deaths_gdf.groupby(['geoid','year','month'])[['deaths']].sum()

avg_svi = deaths_gdf.groupby(['geoid','year','month'])[svi_cols].mean()

grouped_gdf = grouped_gdf.merge(avg_svi, left_index=True, right_index=True)


deaths_gdf = grouped_gdf.reset_index()

# Attach each group's location and dissolved geometry
deaths_gdf = deaths_gdf.merge(just_grid[['geoid','lat','lon', 'geometry']], on='geoid')




  just_grid.loc[:,'lat'] = just_grid.centroid.apply(lambda x: x.y)

  just_grid.loc[:,'lon'] = just_grid.centroid.apply(lambda x: x.x)


In [6]:
    # Used when we just need the unique units (geoid) and their locations.
    # .copy() because a scratch 'haversine' column is written to this frame below;
    # without it the write targets a selection of deaths_gdf (SettingWithCopyWarning).
    just_grid = deaths_gdf.loc[
        (deaths_gdf['year'] == 2000) & (deaths_gdf['month'] == 1),
        ['geoid', 'geometry', 'lat', 'lon']].copy()

    # Calculate each unit's neighbors: every unit whose (lat, lon) is less than
    # 8 km away. The distance of a unit to itself is 0, so each unit is also
    # listed among its own neighbors.
    neighbors = {}
    for _, row in just_grid.iterrows():
        just_grid.loc[:, 'haversine'] = just_grid.apply(
            lambda x: haversine(row['lon'], row['lat'], x['lon'], x['lat']),
            axis=1)
        matching_neighbors = just_grid[just_grid['haversine'] < 8]['geoid'].values
        neighbors[row['geoid']] = matching_neighbors

    tracts = deaths_gdf['geoid'].unique()
    min_year = deaths_gdf.year.min()
    max_year = deaths_gdf.year.max()
    deaths_gdf = deaths_gdf.set_index(['geoid', 'year', 'month']).sort_index()

    # Running period counters. They start at 0 in January of min_year.
    # NOTE(review): the "_since_2000" names only hold if min_year is 2000 -- confirm.
    month_since_2000 = 0
    season_since_2000 = 0
    qtr_since_2000 = 0
    year_since_2000 = 0
    for year in range(min_year, max_year + 1):
        for month in range(1, 12 + 1):
            # Half-year label and calendar quarter (1-4) of this month
            season = 'jan-jun' if month <= 6 else 'jul-dec'
            qtr = (month - 1) // 3 + 1

            # All rows (any geoid) belonging to this year/month
            rows = idx[:, year, month]
            deaths_gdf.loc[rows, 'month_since_2000'] = month_since_2000
            deaths_gdf.loc[rows, 'season'] = season
            deaths_gdf.loc[rows, 'season_since_2000'] = season_since_2000
            deaths_gdf.loc[rows, 'quarter'] = qtr
            deaths_gdf.loc[rows, 'qtr_since_2000'] = qtr_since_2000
            deaths_gdf.loc[rows, 'year_since_2000'] = year_since_2000

            # Advance each counter once its period has ended
            month_since_2000 += 1
            if month in [6, 12]:
                season_since_2000 += 1
            if month in [3, 6, 9, 12]:
                qtr_since_2000 += 1
            if month == 12:
                year_since_2000 += 1

    deaths_gdf = deaths_gdf.reset_index()


In [110]:

# Collapse deaths_gdf to one row per (geoid, year, quarter).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'quarter']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, quarter), the mean 'self_t-1' over the unit's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the unit's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'quarter']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first quarter of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for quarter in range(1, 5):
        cleaned_gdf.loc[idx[:, year, quarter], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

# Write the result to <result_dir>/clean_quarter_group.
svi_out_file = os.path.join(result_dir, 'clean_quarter_group')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [111]:

# Collapse deaths_gdf to one row per (geoid, year, season).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year', 'season']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1,2])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per (year, season), the mean 'self_t-1' over the unit's neighbors,
# shifted one further period back. The result is written positionally (.values), so
# it relies on the neighbors covering exactly the periods of the unit's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :, :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year', 'season']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in the first half of min_year.
timestep = 0

for year in range(min_year, max_year + 1):
    for season in ['jan-jun', 'jul-dec']:
        cleaned_gdf.loc[idx[:, year, season], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

# Write the result to <result_dir>/clean_semi_group.
svi_out_file = os.path.join(result_dir, 'clean_semi_group')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [7]:

# Collapse deaths_gdf to one row per (geoid, year).
cleaned_gdf = deaths_gdf.set_index(['geoid', 'year']).sort_index()
# Lag feature: 'deaths' of the preceding row in the sorted frame (0 for the first row).
# The row indexer has two slices to match the two index levels (the original mixed a
# three-slice indexer on the left with a two-slice one on the right).
# NOTE(review): the shift runs over the whole frame, before duplicate index keys are
# collapsed and without grouping by geoid, so the first row of each geoid receives
# the last value of the geoid sorted before it -- confirm this is intended.
cleaned_gdf.loc[idx[:, :], 'self_t-1'] = cleaned_gdf.loc[idx[:, :], 'deaths'].shift(1, fill_value=0)
# First row of every index key; supplies all columns other than the summed 'deaths'.
unduped_gdf = cleaned_gdf[~cleaned_gdf.index.duplicated(keep='first')]
# Select 'deaths' *before* summing so that no other column (strings, geometry) is
# aggregated; summing every column first raises on pandas >= 2.0.
summed_deaths = cleaned_gdf.groupby(level=[0,1])[['deaths']].sum()
summed_deaths = summed_deaths.merge(unduped_gdf, how='left', left_index=True, right_index=True,suffixes=[None,'_garbage'])
# Drop the un-summed 'deaths' that came along from unduped_gdf.
summed_deaths = summed_deaths.drop('deaths_garbage',axis=1)
cleaned_gdf = summed_deaths
# Neighbor lag: per year, the mean 'self_t-1' over the unit's neighbors, shifted one
# further year back. The result is written positionally (.values), so it relies on
# the neighbors covering exactly the years of the unit's own rows.
for tract in tracts:
    neighbor_lag = cleaned_gdf.loc[idx[neighbors[tract], :], 'self_t-1']
    cleaned_gdf.loc[idx[tract, :], 'neighbor_t-1'] = \
        neighbor_lag.groupby(level=['year']).mean().shift(1, fill_value=0).values

# Consecutive period number, starting at 0 in min_year.
timestep = 0

for year in range(min_year, max_year + 1):
        cleaned_gdf.loc[idx[:, year], 'timestep'] = timestep
        timestep += 1

cleaned_gdf = cleaned_gdf.reset_index()

# Write the result to <result_dir>/clean_annual_group.
svi_out_file = os.path.join(result_dir, 'clean_annual_group')
gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)

  gpd.GeoDataFrame(cleaned_gdf).to_file(svi_out_file)


In [9]:
# Read the 'clean_annual_group' output back from disk. The path is built from
# result_dir (it resolves to the same location as the previously hard-coded string)
# so that it follows the results folder if that is ever changed.
from_disk = gpd.read_file(os.path.join(result_dir, 'clean_annual_group'))

In [10]:
# Display the frame read back from disk. In the output, column names longer than ten
# characters come back truncated (e.g. 'month_sinc', 'neighbor_t').
from_disk

Unnamed: 0,geoid,year,deaths,month,theme_1_pc,theme_2_pc,theme_3_pc,theme_4_pc,svi_pctile,lat,...,month_sinc,season,season_sin,quarter,qtr_since_,year_since,self_t-1,neighbor_t,timestep,geometry
0,0.0,2000,0.0,1,0.49780,0.44132,0.50016,0.55846,0.53140,42.470575,...,0.0,jan-jun,0.0,1.0,0.0,0.0,0.0,0.0000,0.0,"POLYGON ((-71.16300 42.46020, -71.15777 42.457..."
1,0.0,2001,1.0,1,0.49780,0.44132,0.50016,0.55846,0.53140,42.470575,...,12.0,jan-jun,2.0,1.0,4.0,1.0,0.0,0.4375,1.0,"POLYGON ((-71.16300 42.46020, -71.15777 42.457..."
2,0.0,2002,2.0,1,0.49780,0.44132,0.50016,0.55846,0.53140,42.470575,...,24.0,jan-jun,4.0,1.0,8.0,2.0,0.0,0.0625,2.0,"POLYGON ((-71.16300 42.46020, -71.15777 42.457..."
3,0.0,2003,2.0,1,0.49780,0.44132,0.50016,0.55846,0.53140,42.470575,...,36.0,jan-jun,6.0,1.0,12.0,3.0,0.0,0.0625,3.0,"POLYGON ((-71.16300 42.46020, -71.15777 42.457..."
4,0.0,2004,3.0,1,0.49780,0.44132,0.50016,0.55846,0.53140,42.470575,...,48.0,jan-jun,8.0,1.0,16.0,4.0,0.0,0.1250,4.0,"POLYGON ((-71.16300 42.46020, -71.15777 42.457..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7123,323.0,2017,0.0,1,0.41834,0.46742,0.34016,0.48966,0.41618,42.172543,...,204.0,jan-jun,34.0,1.0,68.0,17.0,0.0,0.0000,17.0,"POLYGON ((-73.29697 42.15849, -73.29690 42.158..."
7124,323.0,2018,2.0,1,0.41834,0.46742,0.34016,0.48966,0.41618,42.172543,...,216.0,jan-jun,36.0,1.0,72.0,18.0,0.0,0.0000,18.0,"POLYGON ((-73.29697 42.15849, -73.29690 42.158..."
7125,323.0,2019,3.0,1,0.41632,0.60562,0.31498,0.50054,0.43294,42.172543,...,228.0,jan-jun,38.0,1.0,76.0,19.0,0.0,0.0000,19.0,"POLYGON ((-73.29697 42.15849, -73.29690 42.158..."
7126,323.0,2020,4.0,1,0.41632,0.60562,0.31498,0.50054,0.43294,42.172543,...,240.0,jan-jun,40.0,1.0,80.0,20.0,0.0,0.0000,20.0,"POLYGON ((-73.29697 42.15849, -73.29690 42.158..."


In [1]:
# NOTE(review): this cell carries execution count 1 and its output is a NameError,
# i.e. it was run on a fresh kernel before `tracts` was defined. Run it after the
# cell that assigns `tracts`, or remove it.
tracts

NameError: name 'tracts' is not defined