In [13]:
%matplotlib inline
import os
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import contextily as cx

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    https://stackoverflow.com/a/4913653/1748679
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [14]:
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'
svi_dir = os.path.join(data_dir,'SocialVulnerabilityIndex')
result_dir = os.path.join(data_dir, 'results_20220606_update')
mass_shapefile = os.path.join(data_dir,'shapefiles','MA_2021')

In [15]:
svi_file = os.path.join(result_dir,'svi_month')
svi_gdf = gpd.read_file(svi_file)

In [5]:
svi_gdf = svi_gdf.rename(columns={'INTPTLAT':'lat', 'INTPTLON':'lon', 'GEOID':'grid_squar'})
# Make lat and lon floats
svi_gdf.loc[:, 'lat'] = svi_gdf.lat.astype(float)
svi_gdf.loc[:, 'lon'] = svi_gdf.lon.astype(float)
deaths_gdf = svi_gdf

just_grid = deaths_gdf.loc[(deaths_gdf['year']==2000)&(deaths_gdf['month']==1), ['grid_squar','geometry', 'lat', 'lon']]

# Make lat and lon floats
deaths_gdf.loc[:, 'lat'] = deaths_gdf.lat.astype(float)
deaths_gdf.loc[:, 'lon'] = deaths_gdf.lon.astype(float)

# Calculate each squares neighbors
neighbors = {}
for _, row in just_grid.iterrows():
    just_grid.loc[:,'haversine'] = just_grid.apply(lambda x: haversine(row['lon'], row['lat'],
                                                                         x['lon'], x['lat']),
                                                  axis=1)
    matching_neighbors = just_grid[just_grid['haversine']<5]['grid_squar'].values
    neighbors[row['grid_squar']]=matching_neighbors
    

tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
deaths_gdf = deaths_gdf.set_index(['grid_squar','year','month']).sort_index()

month_since_2000 = 0
season_since_2000 = 0
qtr_since_2000 = 0
for year in range(min_year, max_year+1):
    for month in range(1, 12+1):
        
        if month in [1,2,3,4,5,6]:
            season='janjun'
        else:
            season='juldec'
            
        if month <= 3:
            qtr = 1
        elif month <= 6:
            qtr = 2
        elif month <= 9:
            qtr = 3
        else:
            qtr = 4
        
        deaths_gdf.loc[idx[:,year, month], 'month_since_2000'] = month_since_2000
        deaths_gdf.loc[idx[:,year, month], 'season'] = season
        deaths_gdf.loc[idx[:,year, month], 'season_since_2000'] = season_since_2000
        deaths_gdf.loc[idx[:,year, month], 'quarter'] = qtr
        deaths_gdf.loc[idx[:,year, month], 'qtr_since_2000'] = qtr_since_2000
        
        month_since_2000+=1
        
        if month in [6, 12]:
            season_since_2000 += 1
        
        if month in [3,6,9,12]:
            qtr_since_2000 += 1
            
deaths_gdf = deaths_gdf.reset_index()
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
deaths_gdf = deaths_gdf.set_index(['grid_squar','year','month']).sort_index()
deaths_gdf.loc[idx[:,:,:],'last_timestep'] = deaths_gdf.loc[idx[:,:,:],'deaths'].shift(1,)
deaths_gdf.loc[idx[:,:,:],'last_year'] = deaths_gdf.loc[idx[:,:,:],'deaths'].shift(12)
deaths_gdf.loc[idx[:,:,:],'delta_deaths'] = deaths_gdf.loc[idx[:,:,:],'deaths'] - deaths_gdf.loc[idx[:,:,:],'last_timestep']
for tract in tracts:
     deaths_gdf.loc[idx[tract,:,:],'neighbors_last_timestep']= \
        deaths_gdf.loc[idx[neighbors[tract],:,:],'last_timestep'].groupby(level=['year', 'month']).mean().shift(1).values
     deaths_gdf.loc[idx[tract,:,:],'neighbors_last_year']= \
        deaths_gdf.loc[idx[neighbors[tract],:,:],'last_year'].groupby(level=['year', 'month']).mean().shift(12).values
    
deaths_gdf = deaths_gdf.reset_index()
deaths_gdf_qtr = deaths_gdf.groupby(['grid_squar','year','qtr_since_2000']).sum(min_count=3)[['deaths','delta_deaths','last_timestep','last_year', 'neighbors_last_timestep','neighbors_last_year']].reset_index()
deaths_gdf_season = deaths_gdf.groupby(['grid_squar','year','season_since_2000']).sum(min_count=6)[['deaths','delta_deaths','last_timestep','last_year', 'neighbors_last_timestep','neighbors_last_year']].reset_index()
deaths_gdf_qtr = deaths_gdf.groupby(['grid_squar','year','quarter','qtr_since_2000']).sum(min_count=3)[['deaths','delta_deaths','last_timestep','last_year', 'neighbors_last_timestep','neighbors_last_year']]
deaths_gdf_qtr.loc[idx[:,:,:,:],'last_timestep'] = deaths_gdf_qtr.loc[idx[:,:,:,:],'deaths'].shift(1,)
deaths_gdf_qtr.loc[idx[:,:,:,:],'last_year'] = deaths_gdf_qtr.loc[idx[:,:,:,:],'deaths'].shift(4)
deaths_gdf_qtr.loc[idx[:,:,:,:],'delta_deaths'] = deaths_gdf_qtr.loc[idx[:,:,:,:],'deaths'] - deaths_gdf_qtr.loc[idx[:,:,:,:],'last_timestep']
for tract in tracts:
     deaths_gdf_qtr.loc[idx[tract,:,:,:],'neighbors_last_timestep']= \
        deaths_gdf_qtr.loc[idx[neighbors[tract],:,:,:],'last_timestep'].groupby(level=['year', 'quarter','qtr_since_2000']).mean().shift(1).values
     deaths_gdf_qtr.loc[idx[tract,:,:,:],'neighbors_last_year']= \
        deaths_gdf_qtr.loc[idx[neighbors[tract],:,:,:],'last_year'].groupby(level=['year','quarter', 'qtr_since_2000']).mean().shift(4).values
    
deaths_gdf_qtr = deaths_gdf_qtr.reset_index()
deaths_gdf_season = deaths_gdf.groupby(['grid_squar','year','season','season_since_2000']).sum(min_count=6)[['deaths','delta_deaths','last_timestep','last_year', 'neighbors_last_timestep','neighbors_last_year']]
deaths_gdf_season.loc[idx[:,:,:,:],'last_timestep'] = deaths_gdf_season.loc[idx[:,:,:,:],'deaths'].shift(1,)
deaths_gdf_season.loc[idx[:,:,:,:],'last_year'] = deaths_gdf_season.loc[idx[:,:,:,:],'deaths'].shift(2)
deaths_gdf_season.loc[idx[:,:,::,],'delta_deaths'] = deaths_gdf_season.loc[idx[:,:,:,:],'deaths'] - deaths_gdf_season.loc[idx[:,:,:],'last_timestep']
for tract in tracts:
     deaths_gdf_season.loc[idx[tract,:,:,:],'neighbors_last_timestep']= \
        deaths_gdf_season.loc[idx[neighbors[tract],:,:,:],'last_timestep'].groupby(level=[ 'season_since_2000']).mean().shift(1).values
     deaths_gdf_season.loc[idx[tract,:,:,:],'neighbors_last_year']= \
        deaths_gdf_season.loc[idx[neighbors[tract],:,:,:],'last_year'].groupby(level=[ 'season_since_2000']).mean().shift(2).values
    
deaths_gdf_season = deaths_gdf_season.reset_index()


In [6]:
def fixed_top_X(true_qtr_val,pred_qtr_val, X=10, denom_all=False):
    
    top_X_predicted = pred_qtr_val.sort_values(ascending=False)[:X]
    top_X_true = true_qtr_val.sort_values(ascending=False)[:X]


    undisputed_top_predicted = top_X_predicted[top_X_predicted>top_X_predicted.min()]
    num_tied_spots = X - len(undisputed_top_predicted)
    undisputed_top_true = top_X_true[top_X_true > top_X_true.min()]
    num_true_ties = X - len(undisputed_top_true)


    tied_top_predicted = pred_qtr_val[pred_qtr_val==top_X_predicted.min()]
    tied_top_true = true_qtr_val[true_qtr_val==top_X_true.min()]

    error_in_top_true_ties = np.abs(tied_top_true-pred_qtr_val[tied_top_true.index]).sort_values(ascending=True)
    error_in_top_pred_ties = np.abs(true_qtr_val[tied_top_predicted.index]-tied_top_predicted).sort_values(ascending=True)
    top_true_tied_geoids = error_in_top_true_ties[:num_true_ties].index
    top_pred_tied_geoids = error_in_top_pred_ties[:num_tied_spots].index

    best_possible_top_true_geoids = pd.Index.union(undisputed_top_true.index, top_true_tied_geoids)
    best_possible_top_pred_geoids = pd.Index.union(undisputed_top_predicted.index, top_pred_tied_geoids)

    # True values of GEOIDS with highest actual deaths. If ties, finds tied locations that match preds best
    best_possible_true = true_qtr_val[best_possible_top_true_geoids]
    best_possible_pred = true_qtr_val[best_possible_top_pred_geoids]

    assert(len(best_possible_true)==X)
    assert(len(best_possible_pred)==X)

    best_possible_absolute = np.abs(best_possible_true.sum() - best_possible_pred.sum())
    best_possible_ratio = np.abs(best_possible_pred).sum()/np.abs(best_possible_true).sum()


    bootstrapped_tied_indices = np.random.choice(tied_top_predicted.index, (1000, num_tied_spots))
    bootstrapped_all_indices =  [pd.Index.union(undisputed_top_predicted.index,
                                                bootstrap_index) for bootstrap_index in bootstrapped_tied_indices]

    bootstrapped_absolute = np.mean([np.abs(top_X_true.sum() - true_qtr_val[indices].sum()) 
                                     for indices in bootstrapped_all_indices])
    bootstrapped_ratio = np.mean([np.abs(true_qtr_val[indices]).sum()/np.abs(top_X_true).sum()
                                  for indices in bootstrapped_all_indices])
    if denom_all:
        bootstrapped_ratio = np.mean([np.abs(true_qtr_val[indices]).sum()/np.abs(true_qtr_val).sum()
                                  for indices in bootstrapped_all_indices])
    
    return best_possible_absolute, best_possible_ratio, bootstrapped_absolute, bootstrapped_ratio

In [7]:
deaths_gdf_qtr

Unnamed: 0,grid_squar,year,quarter,qtr_since_2000,deaths,delta_deaths,last_timestep,last_year,neighbors_last_timestep,neighbors_last_year
0,25001010100,2000,1.0,0.0,0.0,,,,,
1,25001010100,2000,2.0,1.0,0.0,0.0,0.0,,,
2,25001010100,2000,3.0,2.0,0.0,0.0,0.0,,0.00,
3,25001010100,2000,4.0,3.0,0.0,0.0,0.0,,0.00,
4,25001010100,2001,1.0,4.0,3.0,3.0,0.0,0.0,0.00,
...,...,...,...,...,...,...,...,...,...,...
142555,25027761402,2020,4.0,83.0,0.0,0.0,0.0,0.0,0.25,0.25
142556,25027761402,2021,1.0,84.0,1.0,1.0,0.0,0.0,0.00,0.75
142557,25027761402,2021,2.0,85.0,0.0,-1.0,1.0,0.0,0.00,0.25
142558,25027761402,2021,3.0,86.0,0.0,0.0,0.0,0.0,0.50,0.00


In [8]:
deaths_gdf_qtr.groupby('year').sum()['deaths']

year
2000     356.0
2001     443.0
2002     480.0
2003     597.0
2004     467.0
2005     546.0
2006     613.0
2007     598.0
2008     594.0
2009     595.0
2010     524.0
2011     620.0
2012     702.0
2013     897.0
2014    1247.0
2015    1551.0
2016    1895.0
2017    1704.0
2018    1887.0
2019    1837.0
2020    1950.0
2021    1755.0
Name: deaths, dtype: float64

In [16]:
qtr_through_2018 = deaths_gdf_qtr[(~deaths_gdf_qtr['last_timestep'].isna()) &
                                  (deaths_gdf_qtr['year']<2019)]
qtr_2019 = deaths_gdf_qtr[deaths_gdf_qtr['year']==2019]
