In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import pandas as pd
import sklearn

In [2]:
%store -r cook_county_gdf_semiannual
%store -r CC_nodeaths
cook_county_gdf = cook_county_gdf_semiannual

In [3]:
#add in missing 58 tracts
cook_county_gdf['deaths'] = 1
cook_county_gdf = pd.concat([cook_county_gdf, CC_nodeaths], ignore_index=True)
cook_county_gdf['semiannual'].fillna(9999, inplace=True) 
cook_county_gdf['year'].fillna(9999, inplace=True)

In [4]:
cook_county_gdf.loc[cook_county_gdf['year'] == 9999, 'GEOID'].nunique() #should be 58

58

In [5]:
columns_to_keep = ['STATEFP', 'COUNTYFP', 'TRACTCE','NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER',
       'INTPTLAT', 'INTPTLON', 'geometry']

gdf_semi = cook_county_gdf.groupby(['GEOID', 'semiannual', 'year']).agg({'deaths': 'sum', **{col: 'first' for col in columns_to_keep}}).reset_index()

In [6]:
gdf_semi.head()

Unnamed: 0,GEOID,semiannual,year,deaths,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,17031010100,1.0,2016.0,2,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.67142 42.02143)
1,17031010100,1.0,2017.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66805 42.02216)
2,17031010100,1.0,2019.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66739 42.02232)
3,17031010100,1.0,2020.0,2,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66639 42.02231)
4,17031010100,1.0,2022.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66696 42.02057)


### Making DF square - one "semi-annual" period for each tract

In [7]:
gdf_semi.loc[gdf_semi['year'] == 9999, 'deaths'] = 0
gdf_semi.loc[gdf_semi['semiannual'] == 9999, 'deaths'] = 0

# Replace 9999 with 2014 in the 'year' column
gdf_semi['year'].replace(9999, 2014, inplace=True)
gdf_semi['semiannual'].replace(9999, 1, inplace=True)

In [8]:
gdf_semi['GEOID'].nunique()  #should be 1332

1332

In [9]:
##nested for loops
years = range(2014, 2024)
periods = [1, 2]

existing_combinations = set(zip(gdf_semi['GEOID'], gdf_semi['year'], gdf_semi['semiannual']))
updated_rows = []

for tract in gdf_semi['GEOID'].unique():
    for year in years:
        for period in periods:
            combination = (tract, year, period)
            if combination not in existing_combinations:
                new_row = {'GEOID': tract, 'year': year, 'semiannual': period, 'deaths': 0}
                updated_rows.append(new_row)

gdf_semi = pd.concat([gdf_semi, pd.DataFrame(updated_rows)], ignore_index=True)

In [10]:
#fills in NAs produced from above
unique_tracts = gdf_semi['GEOID'].unique()

for tract in unique_tracts:
    tract_rows = gdf_semi[gdf_semi['GEOID'] == tract]
    non_na_row = tract_rows.dropna().iloc[0]  # Get the first row without any NAs
    
    for column in columns_to_keep:
        gdf_semi.loc[(gdf_semi['GEOID'] == tract) & gdf_semi[column].isna(), column] = non_na_row[column]

In [11]:
gdf_semi.shape #should be (26640, 15)


(26640, 16)

In [12]:
#confirms all years and periods present
years = range(2014, 2024)  
periods = [1, 2]  

for tract in gdf_semi['GEOID'].unique():
    for year in years:
        for period in periods:
            if not ((gdf_semi['GEOID'] == tract) & (gdf_semi['year'] == year) & (gdf_semi['semiannual'] == period)).any():
                print("Missing row for GEOID {} in year {} semiannual {}".format(tract, year, period))


In [33]:
#Any NAs in data frame?

has_na = gdf_semi.isna().any().any()

if has_na:
    print("There are missing values in the dataframe.")
else:
    print("There are no missing values in the dataframe.")


There are no missing values in the dataframe.


In [34]:
#Any Duplicates?
has_duplicates = gdf_semi.duplicated().any()

if has_duplicates:
    print("There are duplicate rows in the dataframe.")
else:
    print("There are no duplicate rows in the dataframe.")


There are no duplicate rows in the dataframe.


In [35]:
#change format of year and quarters

gdf_semi['year'] = pd.to_datetime(gdf_semi['year'].astype(int).astype(str), format='%Y').dt.year
gdf_semi['semiannual'] = gdf_semi['semiannual'].astype(str).replace('1.0', '1').replace('2.0', '2')


In [36]:
gdf_semi.columns = ['GEOID', 'semiannual', 'year', 'deaths', 'STATEFP', 'COUNTYFP',
       'TRACTCE', 'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER',
       'lat', 'lon', 'geometry']

ValueError: Length mismatch: Expected axis has 18 elements, new values have 16 elements

In [37]:
#timestep column
gdf_semi = gdf_semi.sort_values(by=['GEOID', 'year', 'semiannual'])
gdf_semi['timestep'] = gdf_semi.groupby(['GEOID']).cumcount() + 1

#season column
gdf_semi = gdf_semi.sort_values(by=['GEOID', 'year', 'semiannual'])

def map_season(semiannual):
    return 'jan-jun' if semiannual == '1' else 'jul-dec'

gdf_semi['season'] = gdf_semi['semiannual'].map(map_season)

### store gdf


In [40]:
gdf_semi = gdf_semi[gdf_semi['year'] != 2023]
%store gdf_semi

Stored 'gdf_semi' (DataFrame)
