In [1]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import pickle 
import os
import sklearn
from shapely import wkt


In [2]:
data_dir = os.environ.get('DATA_DIR', '/Users/jyontika/Desktop/cook-county/data/')

gdf_annual = pd.read_csv(f'{data_dir}/cook_county_gdf_cleanwithsvi_year.csv')

#convert to gpd (was having trouble importing csv as gdf)
gdf['geometry'] = gdf['geometry'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(gdf, geometry='geometry')
gdf.crs = {'init': 'EPSG:4269'}
gdf['geoid'] = gdf['geoid'].astype(str) #change to string

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [3]:
data_gdf.shape

(10624, 20)

In [9]:
# Name the important columns
timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

# These are the features we want
x_cols_only = []
add_spacetime = True
add_svi = True
if add_spacetime:
    x_cols_only += [timestep_col, 'INTPTLAT', 'INTPTLON']
if add_svi:
    features_only += ['svi_theme1_pctile', 'svi_theme2_pctile', 'svi_theme3_pctile', 'svi_theme4_pctile', 'svi_total_pctile']

y_cols_only = ['deaths']
info_cols_only = ['year', 'year_frac']

In [13]:
multiindexed_gdf.columns

Index(['year_frac', 'deaths', 'year', 'svi_theme1_pctile', 'svi_theme2_pctile',
       'svi_theme3_pctile', 'svi_theme4_pctile', 'svi_total_pctile', 'STATEFP',
       'COUNTYFP', 'TRACTCE', 'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND',
       'AWATER', 'geometry'],
      dtype='object')

In [10]:
# Create the multiindex
multiindexed_gdf = gdf.set_index([geography_col, timestep_col])

## re-add the timestep column as a feature because it's useful
#multiindexed_gdf[timestep_col] = multiindexed_gdf.index.get_level_values(timestep_col)

# Track number of locations
num_geoids = len(data_gdf[geography_col].unique())

In [11]:
multiindexed_gdf.query("geoid == '17031844700'")[['year', 'deaths']]

Unnamed: 0_level_0,Unnamed: 1_level_0,year,deaths
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1
17031844700,1,2015,0
17031844700,2,2016,2
17031844700,3,2017,5
17031844700,4,2018,6
17031844700,5,2019,3
17031844700,6,2020,5
17031844700,7,2021,4
17031844700,8,2022,1


In [12]:
x_df = multiindexed_gdf[x_cols_only].copy()

KeyError: "['lat', 'lon', 'timestep'] not in index"

In [None]:
y_df = multiindexed_gdf[y_cols_only].copy()

In [7]:
info_df = multiindexed_gdf[info_cols_only].copy()

In [17]:
def make_x_y_data(x_df, y_df, info_df,
        first_year, last_year,
        window_size_in_tsteps,
        lag_in_tsteps=1,
        timesteps_per_year=1,
        how_to_handle_tstep_without_enough_context='raise_error',
        year_col='year', timestep_col='timestep', outcome_col='deaths'):
    """Turn a geodataframe into numpy arrays for model fitting

    Args
    ----
    multiindexed_gdf: Geodataframe, must contain a MultiIndex on [geography, time]
    first_year (int): The first year to make predictions for
    last_year (int): The final year (inclusive) to make predictions for, can be the same as first_year
    window_size_in_tsteps (int): How many timesteps of data prior to the prediction year to include
    feature_cols (list[str]): The column names to be included as features
    lag_in_tsteps (int): The number of timesteps between the outcome y and the inputs x. For annual data, simply 1.
        For quarterly data, there could be a 2-4 quarter lag
    timesteps_per_year (int): How many timesteps in a year? 1 for year, 4 for quarter, etc.
    year_col (str): The name of the column containing the year
    timestep_col (str): The neame of the temporal index level
    outcome_col (str): Name of column with outcome variable (deaths) we are trying to predict

    Returns
    -------
    x_BSF
    y_BS
        B is the number of timesteps in the testing
        S is the number of unique locations
        F is the number of features multiplied by the time window.
    """
    first_year = int(first_year)
    last_year = int(last_year)
    assert last_year >= first_year
    
    W = int(window_size_in_tsteps)
    L = int(lag_in_tsteps)
    new_col_names = ['prev_%s_%02dback' % (outcome_col, W - ww) for ww in range(W)]

    xs = []
    ys = []
    infos = []

    # Iterate over years we want to make predictions for
    for eval_year in range(first_year, last_year + 1):

        timesteps_in_year = info_df[info_df[year_col] == eval_year].index.unique(level=timestep_col).values
        timesteps_in_year = np.sort(np.unique(timesteps_in_year))
        
        for tt, tstep in enumerate(timesteps_in_year):
            # Make per-tstep dataframes
            x_tt_df = x_df.loc[idx[:, tstep], :].copy()
            y_tt_df = y_df.loc[idx[:, tstep], :].copy()
            info_tt_df = info_df.loc[idx[:, tstep], :].copy()

            # Determine if we can get a full window of 'actual' data, or if we need to zero-pad
            if tstep - (W + L - 1) <= 0:
                if how_to_handle_tstep_without_enough_context == 'raise_error':
                    raise ValueError("Not enough context available for tstep %d. Need at least %d previous tsteps" % (tstep, W+L-1))
                assert how_to_handle_tstep_without_enough_context == 'pad_with_zero'
                WW = tstep - L
            else:
                WW = W
            # Grab current tstep's history from outcomes at previous tsteps
            xhist_N = y_df.loc[idx[:, tstep-(WW+L-1):(tstep-L)], outcome_col].values.copy()
            N = xhist_N.shape[0]
            M = N // WW
            xhist_MW = xhist_N.reshape((M, WW))
            if WW < W:
                xhist_MW = np.hstack([ np.zeros((M, W-WW)), xhist_MW])
            assert xhist_MW.shape[1] == W
            for ww in range(W):
                x_tt_df[new_col_names[ww]] = xhist_MW[:, ww]
                
            xs.append(x_tt_df)
            ys.append(y_tt_df)
            infos.append(info_tt_df)

    return pd.concat(xs), pd.concat(ys), pd.concat(infos)

In [20]:
tr_x_df, tr_y_df, tr_i_df = make_x_y_data(x_df, y_df, info_df, 2017, 2019, 3)
va_x_df, va_y_df, va_i_df = make_x_y_data(x_df, y_df, info_df, 2020, 2020, 3)
te_x_df, te_y_df, te_i_df = make_x_y_data(x_df, y_df, info_df, 2021, 2022, 3)

In [21]:
tr_x_df.shape

(15936, 11)

In [22]:
va_x_df.shape

(5312, 11)

In [23]:
te_x_df.shape, te_i_df.shape

((10624, 11), (10624, 2))

In [24]:
prev_cols = tr_x_df.columns[len(x_cols_only):]

In [25]:
tr_x_df[prev_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,prev_deaths_03back,prev_deaths_02back,prev_deaths_01back
geoid,timestep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17031010100,9,2,0,1
17031010201,9,0,0,0
17031010202,9,0,0,0
17031010300,9,1,0,0
17031010400,9,0,0,1
...,...,...,...,...
17031843700,20,0,0,0
17031843800,20,0,0,1
17031843900,20,0,0,1
17031844600,20,0,0,0
