In [1]:
%matplotlib inline
#  disable transparency for gif making
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
import os
from glob import glob

import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.ticker import MaxNLocator

import geopandas as gpd

from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import contextily as cx

In [2]:
# Root of the project data on the cluster; every other path is derived from it.
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/'

# Shapefile directory read with geopandas further down.
mass_shapefile = os.path.join(data_dir, 'shapefiles', 'MA_2021')

# Social Vulnerability Index directory.
svi_dir = os.path.join(data_dir, 'SocialVulnerabilityIndex')

# Holds the geocoded deaths CSV that is read below and the files this
# notebook writes.
result_dir = os.path.join(data_dir, 'results')

In [19]:
# Load the geocoded death records, parse the date-of-death column into
# datetimes, and derive the calendar month (1-12) used for monthly grouping.
matched_df = (
    pd.read_csv(os.path.join(result_dir, 'geocoded_deaths_2000_2020.csv'))
    .assign(dod_dt=lambda records: pd.to_datetime(records['dod_dt']))
    .assign(month=lambda records: records['dod_dt'].dt.month)
)

In [20]:
# Read the shapefile whose TRACTCE column is the join key for the death counts.
mass_gdf = gpd.read_file(mass_shapefile)

# Cast both join keys to int so that `tract` and `TRACTCE` compare equal.
# Assign the whole column instead of using `.loc[:, col] = ...`: recent pandas
# treats the `.loc` form as an in-place write that keeps the column's existing
# dtype, so the cast could silently fail to change the column's dtype.
matched_df['tract'] = matched_df['tract'].astype(int)
mass_gdf['TRACTCE'] = mass_gdf['TRACTCE'].astype(int)

In [22]:
# One row per (year, month, tract) that has at least one record, with the
# number of records in that group stored in a `deaths` column.
deaths_per_tract_df = (
    matched_df
    .groupby(['year', 'month', 'tract'])
    .size()
    .rename('deaths')
    .reset_index()
)

In [23]:
# Sanity check: every tract that appears in the death counts must also exist
# in the shapefile, otherwise the merge below would silently drop its deaths.
# The shapefile's tract codes are collected once into a set, rather than
# recomputing `unique()` for every tract tested.
known_tracts = set(mass_gdf['TRACTCE'].unique())
unmatched_tracts = sorted(set(deaths_per_tract_df['tract'].unique()) - known_tracts)
if unmatched_tracts:
    # Report the offending codes (truncated) so the failure is actionable.
    raise ValueError(
        'Failed to match a tract! '
        f'{len(unmatched_tracts)} unmatched, e.g. {unmatched_tracts[:10]}'
    )

In [25]:
# Build one long GeoDataFrame that repeats every row of `mass_gdf` for each
# (year, month) combination and attaches that period's death count
# (0 where the tract has no recorded deaths in the period).
#
# NOTE(review): the join uses TRACTCE alone. If tract codes repeat across
# counties in the shapefile, a count would attach to every matching row --
# confirm whether a county key is needed as well.
monthly_frames = []
for year in deaths_per_tract_df.year.unique():
    for month in deaths_per_tract_df.month.unique():
        in_period = (deaths_per_tract_df['year'] == year) & (deaths_per_tract_df['month'] == month)
        these_deaths = deaths_per_tract_df[in_period]
        # Left merge keeps every shapefile row; rows without deaths in this
        # period come back with NaN in the columns taken from `these_deaths`.
        years_merged_deaths = mass_gdf.merge(these_deaths,
                                             left_on='TRACTCE',
                                             right_on='tract',
                                             how='left')
        # fill NAs
        # Whole-column assignment (not `.loc[:, col] = ...`) so the columns
        # take the dtype of the assigned values instead of keeping the
        # NaN-induced dtype left by the merge.
        years_merged_deaths['year'] = year
        years_merged_deaths['month'] = month
        years_merged_deaths['deaths'] = years_merged_deaths['deaths'].fillna(0)
        monthly_frames.append(years_merged_deaths)

# `DataFrame.append` was removed in pandas 2.0 and copied the accumulated
# frame on every iteration; concatenate once instead. The index is left
# un-reset, matching what repeated appends produced.
if monthly_frames:
    deaths_gdf = pd.concat(monthly_frames)
else:
    # No (year, month) combinations at all: keep the original empty result.
    deaths_gdf = gpd.GeoDataFrame()

In [33]:
# Output locations inside the results directory.
deaths_file = os.path.join(result_dir, 'res_deaths_month_all')
matched_file = os.path.join(result_dir, 'geocoded_deaths_monthly.csv')

# Write the per-tract monthly GeoDataFrame, then the death records (now
# including the `month` column) as CSV without the index.
deaths_gdf.to_file(filename=deaths_file)
matched_df.to_csv(path_or_buf=matched_file, index=False)

In [42]:
# Display the number of records per (year, month, tract) group.
matched_df.groupby(by=['year', 'month', 'tract']).size()

year  month  tract 
2000  1      70501     1
             170502    1
             205600    1
             250200    1
             250400    1
                      ..
2021  12     40400     1
             206000    1
             252201    1
             652000    1
             900100    1
Length: 20603, dtype: int64

In [45]:
# Display the column-wise maximum of the monthly per-tract counts
# (latest year, highest month, largest tract code, peak monthly deaths).
deaths_per_tract_df.max(axis=0)

year        2021
month         12
tract     985600
deaths         8
dtype: int64