In [1]:
%matplotlib inline
import os
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import contextily as cx

In [2]:
# Root of the shared dataset tree. Defaults to the original cluster location;
# set the NSF_OD_DATA_DIR environment variable to run the notebook elsewhere.
data_dir = os.environ.get('NSF_OD_DATA_DIR', '/cluster/tufts/hugheslab/datasets/NSF_OD/')
svi_dir = os.path.join(data_dir, 'SocialVulnerabilityIndex')
result_dir = os.path.join(data_dir, 'results')
mass_shapefile = os.path.join(data_dir, 'shapefiles', 'MA_2021')

In [3]:
# Read the 'svi_month' dataset from the results directory into a GeoDataFrame.
# Later cells expect it to provide (at least) INTPTLAT, INTPTLON, GEOID,
# year, month, deaths and geometry columns.
svi_file = os.path.join(result_dir,'svi_month')
svi_gdf = gpd.read_file(svi_file)

In [4]:
# Rename the coordinate and identifier columns to the names used in the rest
# of the notebook.
svi_gdf = svi_gdf.rename(columns={'INTPTLAT':'lat', 'INTPTLON':'lon', 'GEOID':'grid_squar'})
# Make lat and lon floats. Whole-column assignment is used on purpose:
# `.loc[:, col] = ...` may write into the existing column in place and keep
# its previous (non-float) dtype on recent pandas versions.
svi_gdf['lat'] = svi_gdf['lat'].astype(float)
svi_gdf['lon'] = svi_gdf['lon'].astype(float)
# NOTE: alias, not a copy -- both names refer to the same frame.
deaths_gdf = svi_gdf

# One row per grid square, taken from the January 2000 slice. Copied so later
# cells can work on it without touching (or warning about) deaths_gdf.
just_grid = deaths_gdf.loc[(deaths_gdf['year']==2000)&(deaths_gdf['month']==1),
                           ['grid_squar','geometry', 'lat', 'lon']].copy()


In [5]:
# Make lat and lon floats.
# Whole-column assignment is used because `.loc[:, col] = ...` may write into
# the existing column in place and keep its previous dtype.
deaths_gdf['lat'] = deaths_gdf['lat'].astype(float)
deaths_gdf['lon'] = deaths_gdf['lon'].astype(float)

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance, in kilometers, between two points given as
    (longitude, latitude) pairs in decimal degrees.

    Arguments are scalars, ordered lon1, lat1, lon2, lat2.
    Formula reference: https://stackoverflow.com/a/4913653/1748679
    """
    # Earth radius in kilometers; this fixes the unit of the result
    # (3956 would give miles instead).
    earth_radius = 6371

    # Work in radians from here on.
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle, then the angle itself.
    hav = sin(half_dphi)**2 + cos(phi1) * cos(phi2) * sin(half_dlam)**2
    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius

In [6]:
# Calculate each square's neighbors: all grid squares whose (lat, lon) point
# lies within neighbor_radius_km of it, by great-circle distance.
# The distance of a square to itself is 0, so every square is listed among
# its own neighbors.
#
# Distances are computed with NumPy for one square against all others at
# once (same haversine formula and radius as haversine()), instead of a
# row-wise apply inside iterrows, and no scratch column is written to
# just_grid.
neighbor_radius_km = 5
earth_radius_km = 6371

grid_ids = just_grid['grid_squar'].to_numpy()
lat_rad = np.radians(just_grid['lat'].to_numpy(dtype=float))
lon_rad = np.radians(just_grid['lon'].to_numpy(dtype=float))
cos_lat = np.cos(lat_rad)

neighbors = {}
for pos, grid_id in enumerate(grid_ids):
    half_dlat = (lat_rad - lat_rad[pos]) / 2
    half_dlon = (lon_rad - lon_rad[pos]) / 2
    hav = np.sin(half_dlat)**2 + cos_lat[pos] * cos_lat * np.sin(half_dlon)**2
    # Clip guards against rounding pushing hav marginally outside [0, 1].
    dist_km = 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))
    neighbors[grid_id] = grid_ids[dist_km < neighbor_radius_km]

In [7]:

tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
deaths_gdf = deaths_gdf.set_index(['grid_squar','year','month']).sort_index()

# Derive the calendar helper columns directly from the (year, month) index
# levels instead of assigning one (year, month) slice at a time.
# The *_since_2000 counters start at 0 in the first year present in the data
# (min_year) -- the column names assume that year is 2000; TODO confirm.
years_elapsed = deaths_gdf.index.get_level_values('year').to_numpy() - min_year
month_idx = deaths_gdf.index.get_level_values('month').to_numpy() - 1  # 0 = January

# Numeric columns are cast to float so their dtype matches what the previous
# slice-by-slice assignment produced.
deaths_gdf['month_since_2000'] = (years_elapsed * 12 + month_idx).astype(float)
# Half-year label: months 1-6 -> 'janjun', months 7-12 -> 'juldec'.
deaths_gdf['season'] = np.where(month_idx < 6, 'janjun', 'juldec')
deaths_gdf['season_since_2000'] = (years_elapsed * 2 + month_idx // 6).astype(float)
# Calendar quarter 1..4.
deaths_gdf['quarter'] = (month_idx // 3 + 1).astype(float)
deaths_gdf['qtr_since_2000'] = (years_elapsed * 4 + month_idx // 3).astype(float)
        

In [8]:
deaths_gdf = deaths_gdf.reset_index()
tracts = deaths_gdf['grid_squar'].unique()
min_year = deaths_gdf.year.min()
max_year = deaths_gdf.year.max()
deaths_gdf = deaths_gdf.set_index(['grid_squar','year','month']).sort_index()

# Lagged deaths, computed separately for each grid square. A plain .shift()
# on the stacked frame would carry the last rows of one square into the first
# rows of the next; grouping by square leaves those first rows as NaN instead.
deaths_by_square = deaths_gdf.groupby(level='grid_squar')['deaths']
deaths_gdf['last_timestep'] = deaths_by_square.shift(1)    # previous month
deaths_gdf['last_year'] = deaths_by_square.shift(12)       # same month, previous year
deaths_gdf['delta_deaths'] = deaths_gdf['deaths'] - deaths_gdf['last_timestep']

# Neighborhood averages: for each square, the per-(year, month) mean over the
# squares listed in neighbors[tract].
# NOTE(review): the averaged columns are already lagged and are shifted again
# here, so these features look 2 months / 24 months back -- confirm that the
# double lag is intended.
for tract in tracts:
    nearby = neighbors[tract]
    deaths_gdf.loc[idx[tract,:,:],'neighbors_last_timestep'] = \
        deaths_gdf.loc[idx[nearby,:,:],'last_timestep'].groupby(level=['year', 'month']).mean().shift(1).values
    deaths_gdf.loc[idx[tract,:,:],'neighbors_last_year'] = \
        deaths_gdf.loc[idx[nearby,:,:],'last_year'].groupby(level=['year', 'month']).mean().shift(12).values
    


In [9]:
# Quarterly aggregation of the monthly frame.
# Columns are selected before summing so that non-numeric columns (geometry,
# season labels, ...) are never passed to the aggregation.
# min_count=3: a quarter's sum is NaN unless 3 monthly values are present.
qtr_value_cols = ['deaths','delta_deaths','last_timestep','last_year',
                  'neighbors_last_timestep','neighbors_last_year']
deaths_gdf_qtr = (deaths_gdf
                  .groupby(['grid_squar','year','quarter','qtr_since_2000'])[qtr_value_cols]
                  .sum(min_count=3))

# Recompute the lag features at quarterly resolution, separately per grid
# square so that values never spill from one square into the next.
qtr_deaths_by_square = deaths_gdf_qtr.groupby(level='grid_squar')['deaths']
deaths_gdf_qtr['last_timestep'] = qtr_deaths_by_square.shift(1)   # previous quarter
deaths_gdf_qtr['last_year'] = qtr_deaths_by_square.shift(4)       # same quarter, previous year
deaths_gdf_qtr['delta_deaths'] = deaths_gdf_qtr['deaths'] - deaths_gdf_qtr['last_timestep']

# NOTE(review): the neighbor means are taken over already-lagged columns and
# shifted again (2 quarters / 8 quarters back in total) -- confirm intent.
for tract in tracts:
    nearby = neighbors[tract]
    deaths_gdf_qtr.loc[idx[tract,:,:,:],'neighbors_last_timestep'] = \
        deaths_gdf_qtr.loc[idx[nearby,:,:,:],'last_timestep'].groupby(level=['year', 'quarter','qtr_since_2000']).mean().shift(1).values
    deaths_gdf_qtr.loc[idx[tract,:,:,:],'neighbors_last_year'] = \
        deaths_gdf_qtr.loc[idx[nearby,:,:,:],'last_year'].groupby(level=['year','quarter', 'qtr_since_2000']).mean().shift(4).values

deaths_gdf_qtr = deaths_gdf_qtr.reset_index()

In [10]:
# Half-year ("season") aggregation of the monthly frame.
# Columns are selected before summing so that non-numeric columns (geometry,
# ...) are never passed to the aggregation.
# min_count=6: a season's sum is NaN unless 6 monthly values are present.
season_value_cols = ['deaths','delta_deaths','last_timestep','last_year',
                     'neighbors_last_timestep','neighbors_last_year']
deaths_gdf_season = (deaths_gdf
                     .groupby(['grid_squar','year','season','season_since_2000'])[season_value_cols]
                     .sum(min_count=6))

# Recompute the lag features at half-year resolution, separately per grid
# square so that values never spill from one square into the next.
season_deaths_by_square = deaths_gdf_season.groupby(level='grid_squar')['deaths']
deaths_gdf_season['last_timestep'] = season_deaths_by_square.shift(1)   # previous half-year
deaths_gdf_season['last_year'] = season_deaths_by_square.shift(2)       # same half, previous year
deaths_gdf_season['delta_deaths'] = deaths_gdf_season['deaths'] - deaths_gdf_season['last_timestep']

# NOTE(review): the neighbor means are taken over already-lagged columns and
# shifted again (2 / 4 half-years back in total) -- confirm intent.
for tract in tracts:
    nearby = neighbors[tract]
    deaths_gdf_season.loc[idx[tract,:,:,:],'neighbors_last_timestep'] = \
        deaths_gdf_season.loc[idx[nearby,:,:,:],'last_timestep'].groupby(level=[ 'season_since_2000']).mean().shift(1).values
    deaths_gdf_season.loc[idx[tract,:,:,:],'neighbors_last_year'] = \
        deaths_gdf_season.loc[idx[nearby,:,:,:],'last_year'].groupby(level=[ 'season_since_2000']).mean().shift(2).values

deaths_gdf_season = deaths_gdf_season.reset_index()

In [13]:
# Quarterly rows for calendar year 2019 only.
is_2019 = deaths_gdf_qtr['year'] == 2019
qtr_2019 = deaths_gdf_qtr.loc[is_2019]

In [26]:
# For each 2019 quarter: how much of the quarter's deaths fall in the
# highest-count grid squares.
#   *_20p -> top 162*2 squares (presumably 20% of all squares -- TODO confirm)
#   *_100 -> top 100 squares
pct_20p = []
pct_100 = []
for quarter in range(1, 4+1):
    in_quarter = qtr_2019['quarter'] == quarter
    quarter_deaths = qtr_2019[in_quarter].set_index('grid_squar')['deaths']
    tot_deaths = quarter_deaths.sum()

    # Largest counts first, then sum the leading rows.
    ranked = quarter_deaths.sort_values(ascending=False)
    sum_20p = ranked.iloc[:162*2].sum()
    sum_100 = ranked.iloc[:100].sum()

    pct_20p.append(sum_20p/tot_deaths)
    pct_100.append(sum_100/tot_deaths)

    print(f'{sum_20p}/{tot_deaths}, {sum_100}/{tot_deaths}')

394.0/443.0, 170.0/443.0
395.0/444.0, 171.0/444.0
411.0/456.0, 187.0/456.0
404.0/494.0, 180.0/494.0


In [30]:
# Mean of the per-period shares currently held in pct_100 (top 100 squares)
# and pct_20p (top 162*2 squares), displayed in that order.
np.mean(pct_100), np.mean(pct_20p)

(0.3858356255996449, 0.8745399283707173)

In [33]:
# Same concentration summary as for quarters, at half-year resolution: for
# each 2019 season, the share of deaths in the highest-count grid squares.
#   *_20p -> top 162*2 squares (presumably 20% of all squares -- TODO confirm)
#   *_100 -> top 100 squares
# NOTE: pct_20p / pct_100 are re-used names and are reset here.
is_2019 = deaths_gdf_season['year'] == 2019
season_2019 = deaths_gdf_season[is_2019]
pct_20p = []
pct_100 = []
for season in ['janjun','juldec']:
    in_season = season_2019['season'] == season
    season_deaths = season_2019[in_season].set_index('grid_squar')['deaths']
    tot_deaths = season_deaths.sum()

    # Largest counts first, then sum the leading rows.
    ranked = season_deaths.sort_values(ascending=False)
    sum_20p = ranked.iloc[:162*2].sum()
    sum_100 = ranked.iloc[:100].sum()

    pct_20p.append(sum_20p/tot_deaths)
    pct_100.append(sum_100/tot_deaths)

    print(f'{sum_20p}/{tot_deaths}, {sum_100}/{tot_deaths}')

577.0/887.0, 261.0/887.0
620.0/950.0, 278.0/950.0


In [34]:
# Mean of the per-period shares currently held in pct_100 (top 100 squares)
# and pct_20p (top 162*2 squares), displayed in that order.
np.mean(pct_100), np.mean(pct_20p)

(0.29344093039814867, 0.6515694535097609)