In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import pickle 
import os
import sklearn
from shapely import wkt


In [2]:
data_dir = os.environ.get('DATA_DIR', '/Users/jyontika/Desktop/cook-county/data/')

gdf_annual = pd.read_csv(f'{data_dir}/cook_county_gdf_cleanwithsvi_year.csv')

#convert to gpd (was having trouble importing csv as gdf)
gdf_annual['geometry'] = gdf_annual['geometry'].apply(wkt.loads)
gdf_annual = gpd.GeoDataFrame(gdf_annual, geometry='geometry')
gdf_annual.crs = {'init': 'EPSG:4269'}
gdf_annual['geoid'] = gdf_annual['geoid'].astype(str) #change to string

data_gdf = gdf_annual

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [3]:
data_gdf.shape

(10624, 20)

In [9]:
# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the columns we could possibly want in the X dataframe
x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']


# These are the columns we could want in the Y dataframe
y_idx_cols = [geography_col, timestep_col, outcome_col]

# These are the features we want
features_only = []
add_spacetime = True
add_svi = True
if add_spacetime:
    features_only += ['lat', 'lon', timestep_col]
if add_svi:
    features_only += ['svi_theme1_pctile', 'svi_theme2_pctile', 'svi_theme3_pctile', 'svi_theme4_pctile', 'svi_total_pctile']

outcomes_only = ['deaths']
info_only = ['year']

In [13]:
multiindexed_gdf.columns

Index(['year_frac', 'deaths', 'year', 'svi_theme1_pctile', 'svi_theme2_pctile',
       'svi_theme3_pctile', 'svi_theme4_pctile', 'svi_total_pctile', 'STATEFP',
       'COUNTYFP', 'TRACTCE', 'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND',
       'AWATER', 'geometry'],
      dtype='object')

In [10]:
# Create the multiindex
multiindexed_gdf = data_gdf.set_index([geography_col, timestep_col])

## re-add the timestep column as a feature because it's useful
#multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [11]:
multiindexed_gdf.query("geoid == '17031844700'")[['year', 'deaths']]

Unnamed: 0_level_0,Unnamed: 1_level_0,year,deaths
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1
17031844700,1,2015,0
17031844700,2,2016,2
17031844700,3,2017,5
17031844700,4,2018,6
17031844700,5,2019,3
17031844700,6,2020,5
17031844700,7,2021,4
17031844700,8,2022,1


In [12]:
x_df = multiindexed_gdf[features_only].copy()

KeyError: "['lat', 'lon', 'timestep'] not in index"

In [None]:
y_df = multiindexed_gdf[outcomes_only].copy()

In [8]:
info_df = multiindexed_gdf[info_only].copy()

In [9]:
info_df

Unnamed: 0_level_0,Unnamed: 1_level_0,year
geoid,timestep,Unnamed: 2_level_1
17031010100,2,2015
17031010100,3,2016
17031010100,4,2017
17031010100,5,2018
17031010100,6,2019
...,...,...
17031844700,4,2018
17031844700,5,2019
17031844700,6,2020
17031844700,7,2021


In [10]:


def make_data(x_df, y_df, info_df,
              first_year, last_year,
              window_size_in_tsteps,
              feature_cols,
              lag_in_tsteps=1,
              timesteps_per_year=1,
              year_col='year', timestep_col='timestep', outcome_col='deaths'):
    """Turn a geodataframe into numpy arrays for model fitting

    Args
    ----
    multiindexed_gdf: Geodataframe, must contain a MultiIndex on [geography, time]
    first_year (int): The first year to make predictions for
    last_year (int): The final year to make predictions for, can be the same as first_year
    window_size_in_tsteps (int): How many timesteps of data prior to the prediction year to include
    feature_cols (list[str]): The column names to be included as features
    lag_in_tsteps (int): The number of timesteps between the outcome y and the inputs x. For annual data, simply 1.
        For quarterly data, there could be a 2-4 quarter lag
    timesteps_per_year (int): How many timesteps in a year? 1 for year, 4 for quarter, etc.
    year_col (str): The name of the column containing the year
    timestep_col (str): The neame of the temporal index level
    outcome_col (str): Name of column with outcome variable (deaths) we are trying to predict

    Returns
    -------
    x_BSF
    y_BS
        B is the number of timesteps in the testing
        S is the number of unique locations
        F is the number of features multiplied by the time window.
    """
    W = int(window_size_in_tsteps)
    L = int(lag_in_tsteps)
    new_col_names = ['prev_%s_%02dback' % (outcome_col, W - ww) for ww in range(W)]

    xs = []
    ys = []
    infos = []

    # Iterate over years we want to make predictions for
    for eval_year in range(first_year, last_year + 1):

        timesteps_in_year = info_df[info_df[year_col] == eval_year].index.unique(level=timestep_col).values
        timesteps_in_year = np.unique(timesteps_in_year)
        
        for tt, tstep in enumerate(timesteps_in_year):
            print(tstep)
            # Grab current tstep's history from outcomes at previous tsteps
            xhist_N = y_df.loc[idx[:, tstep-(W+L-1):(tstep-L)], outcome_col].values.copy()
            N = xhist_N.shape[0]
            xhist_MW = xhist_N.reshape((N//W, W))

            x_tt_df = x_df.loc[idx[:, tstep], :].copy()
            y_tt_df = y_df.loc[idx[:, tstep], :].copy()
            info_tt_df = info_df.loc[idx[:, tstep], :].copy()

            print(x_tt_df.shape)
            print(xhist_MW.shape)
            for ww in range(W):
                x_tt_df[new_col_names[ww]] = xhist_MW[:, ww]
                
            xs.append(x_tt_df)
            ys.append(y_tt_df)
            infos.append(info_tt_df)

    return pd.concat(xs), pd.concat(ys), pd.concat(infos)
    '''
    = np.stack(xs, axis=0)
    y_BS = np.stack(ys)

    x_BSTD = tf.convert_to_tensor(x_BSTD, dtype=tf.float32)
    y_BS = tf.convert_to_tensor(y_BS, dtype=tf.float32)

    B, S, T, D = x_BSTD.shape

    assert (B == len(range(first_year, last_year + 1)) * timesteps_per_year)
    assert (S == num_locations)
    assert (T == time_window)
    assert (D == len(feature_cols))

    # Reshape the training data to flatten the dimensions
    x_BSF_flat = tf.reshape(x_BSTD, (B, S, T * D), )

    return x_BSF_flat, y_BS
    '''

In [18]:
info_df[info_df['year'] == 2019].index.unique(level=timestep_col)

Index([6, 5], dtype='int64', name='timestep')

In [21]:
info_df.loc[info_df['year'] == 2019]

Unnamed: 0_level_0,Unnamed: 1_level_0,year
geoid,timestep,Unnamed: 2_level_1
17031010100,6,2019
17031010201,5,2019
17031010202,5,2019
17031010300,5,2019
17031010400,5,2019
...,...,...
17031843700,5,2019
17031843800,6,2019
17031843900,5,2019
17031844600,5,2019


In [29]:
multiindexed_gdf.query("geoid == '17031844700'")[['year', 'deaths']]

Unnamed: 0_level_0,Unnamed: 1_level_0,year,deaths
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1
17031844700,1,2015,0
17031844700,2,2016,2
17031844700,3,2017,5
17031844700,4,2018,6
17031844700,5,2019,3
17031844700,6,2020,5
17031844700,7,2021,4
17031844700,8,2022,1


In [27]:
info_df.query("geoid == '17031844700'")

Unnamed: 0_level_0,Unnamed: 1_level_0,year
geoid,timestep,Unnamed: 2_level_1
17031844700,1,2015
17031844700,2,2016
17031844700,3,2017
17031844700,4,2018
17031844700,5,2019
17031844700,6,2020
17031844700,7,2021
17031844700,8,2022


In [26]:
info_df.query("geoid == '17031010100'")

Unnamed: 0_level_0,Unnamed: 1_level_0,year
geoid,timestep,Unnamed: 2_level_1
17031010100,2,2015
17031010100,3,2016
17031010100,4,2017
17031010100,5,2018
17031010100,6,2019
17031010100,7,2020
17031010100,8,2021
17031010100,9,2022


In [11]:
x, y, i = make_data(x_df, y_df, info_df, 2019, 2019, 3, features_only)

5
(1328, 8)
(1328, 3)
6
(1328, 8)
(1328, 3)


In [12]:
x.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,timestep,theme_1_pc,theme_2_pc,theme_3_pc,theme_4_pc,svi_pctile,prev_deaths_03back,prev_deaths_02back,prev_deaths_01back
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17031010100,5,42.021255,-87.66983,5,0.6993,0.3324,0.6231,0.9294,0.7652,1,3,2
17031010201,5,42.016008,-87.680148,5,0.7859,0.7549,0.7832,0.9966,0.9681,0,1,1
17031010202,5,42.016048,-87.673326,5,0.9211,0.613,0.7866,0.9991,0.9911,0,2,3
17031010300,5,42.015943,-87.666539,5,0.7681,0.2998,0.6767,0.9757,0.84,1,3,0
17031010400,5,42.006411,-87.658816,5,0.6916,0.1262,0.5161,0.9721,0.711,1,2,1


In [13]:
i.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year
geoid,timestep,Unnamed: 2_level_1
17031010100,5,2018
17031010201,5,2019
17031010202,5,2019
17031010300,5,2019
17031010400,5,2019


In [14]:
i.index

MultiIndex([('17031010100', 5),
            ('17031010201', 5),
            ('17031010202', 5),
            ('17031010300', 5),
            ('17031010400', 5),
            ('17031010501', 5),
            ('17031010502', 5),
            ('17031010503', 5),
            ('17031010600', 5),
            ('17031010701', 5),
            ...
            ('17031843200', 6),
            ('17031843300', 6),
            ('17031843400', 6),
            ('17031843500', 6),
            ('17031843600', 6),
            ('17031843700', 6),
            ('17031843800', 6),
            ('17031843900', 6),
            ('17031844600', 6),
            ('17031844700', 6)],
           names=['geoid', 'timestep'], length=2656)

In [15]:
x.index

MultiIndex([('17031010100', 5),
            ('17031010201', 5),
            ('17031010202', 5),
            ('17031010300', 5),
            ('17031010400', 5),
            ('17031010501', 5),
            ('17031010502', 5),
            ('17031010503', 5),
            ('17031010600', 5),
            ('17031010701', 5),
            ...
            ('17031843200', 6),
            ('17031843300', 6),
            ('17031843400', 6),
            ('17031843500', 6),
            ('17031843600', 6),
            ('17031843700', 6),
            ('17031843800', 6),
            ('17031843900', 6),
            ('17031844600', 6),
            ('17031844700', 6)],
           names=['geoid', 'timestep'], length=2656)