In [24]:
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import pandas as pd
import sklearn

In [50]:
# Restore the two objects previously persisted with IPython's %store magic.
%store -r cook_county_gdf_quarterly
%store -r CC_nodeaths
# NOTE: this binds a second name to the same object (no copy is made), so
# in-place changes made through cook_county_gdf also show up in
# cook_county_gdf_quarterly.
cook_county_gdf = cook_county_gdf_quarterly

In [51]:
# Add in the missing 58 tracts (the CC_nodeaths rows).
# Every row already in cook_county_gdf is flagged with deaths = 1 so that summing
# `deaths` per tract/quarter/year later yields a count
# (presumably one row per death record - confirm against the upstream notebook).
cook_county_gdf['deaths'] = 1
cook_county_gdf = pd.concat([cook_county_gdf, CC_nodeaths], ignore_index=True)

# Rows without a quarter/year get the sentinel 9999 so they are not silently
# dropped by a later groupby on these columns (groupby discards NaN keys by default).
# The result is assigned back rather than using `fillna(..., inplace=True)` on a
# column selection: that form is chained assignment, which raises a FutureWarning
# in recent pandas and leaves the frame unchanged once Copy-on-Write is enabled.
cook_county_gdf['quarter'] = cook_county_gdf['quarter'].fillna(9999)
cook_county_gdf['year'] = cook_county_gdf['year'].fillna(9999)

In [52]:
# Number of distinct tracts whose rows carry the 9999 placeholder year.
cook_county_gdf[cook_county_gdf['year'].eq(9999)]['GEOID'].nunique()

58

In [53]:
# Tract-level attribute columns carried through the aggregation.
columns_to_keep = [
    'STATEFP', 'COUNTYFP', 'TRACTCE', 'NAME', 'NAMELSAD', 'MTFCC',
    'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry',
]

# One output row per (GEOID, quarter, year): `deaths` is summed and each kept
# column is reduced with the 'first' aggregation.
gdf_quarter = (
    cook_county_gdf
    .groupby(['GEOID', 'quarter', 'year'])
    .agg({'deaths': 'sum', **dict.fromkeys(columns_to_keep, 'first')})
    .reset_index()
)

In [54]:
# Preview the first five aggregated rows.
gdf_quarter.head(n=5)

Unnamed: 0,GEOID,quarter,year,deaths,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,17031010100,1.0,2017.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66805 42.02216)
1,17031010100,1.0,2020.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66639 42.02231)
2,17031010100,1.0,2023.0,3,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.67019 42.02070)
3,17031010100,2.0,2016.0,2,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.67142 42.02143)
4,17031010100,2.0,2019.0,1,17,31,10100,101,Census Tract 101,G5020,S,379511,0,42.0212553,-87.6698301,POINT (-87.66739 42.02232)


### Making the DataFrame square: one row per tract for every year and quarter (2014-2023)


In [55]:
# Rows still carrying the 9999 sentinel in year or quarter are placeholders:
# force their death count to zero.
gdf_quarter.loc[gdf_quarter['year'] == 9999, 'deaths'] = 0
gdf_quarter.loc[gdf_quarter['quarter'] == 9999, 'deaths'] = 0

# Replace the 9999 sentinel with a real period: year 2014, quarter 1.
# The result is assigned back rather than using `replace(..., inplace=True)` on a
# column selection: that form is chained assignment, which raises a FutureWarning
# in recent pandas and leaves the frame unchanged once Copy-on-Write is enabled.
gdf_quarter['year'] = gdf_quarter['year'].replace(9999, 2014)
gdf_quarter['quarter'] = gdf_quarter['quarter'].replace(9999, 1)

In [56]:
# Number of distinct census tracts (missing GEOIDs are not counted).
gdf_quarter['GEOID'].nunique(dropna=True)

1332

In [57]:
# Enumerate every (tract, year, quarter) combination for 2014-2023 and append a
# zero-death row for each combination that is not already in gdf_quarter.
years = range(2014, 2024)
quarters = range(1, 5)

existing_combinations = set(
    zip(gdf_quarter['GEOID'], gdf_quarter['year'], gdf_quarter['quarter'])
)

# The comprehension iterates tract, then year, then quarter, so the appended
# rows come out in that order.
updated_rows = [
    {'GEOID': tract, 'year': year, 'quarter': quarter, 'deaths': 0}
    for tract in gdf_quarter['GEOID'].unique()
    for year in years
    for quarter in quarters
    if (tract, year, quarter) not in existing_combinations
]

gdf_quarter = pd.concat(
    [gdf_quarter, pd.DataFrame(updated_rows)],
    ignore_index=True,
)


In [59]:
# Fill the NAs in the tract-level attribute columns (columns_to_keep): for each
# tract, copy the values from the first row of that tract that has all of those
# columns populated into the rows where they are missing.
#
# The tract's rows are selected once per tract (instead of re-scanning the whole
# frame for every column), and the rows to fill are addressed by index label.
# This relies on gdf_quarter having a unique index, which holds after a
# concat(..., ignore_index=True).
for tract in gdf_quarter['GEOID'].unique():
    tract_rows = gdf_quarter[gdf_quarter['GEOID'] == tract]

    complete_rows = tract_rows.dropna(subset=columns_to_keep)
    if complete_rows.empty:
        # Previously this surfaced as a bare IndexError from .iloc[0].
        raise ValueError(
            "No complete row to copy tract attributes from for GEOID {}".format(tract)
        )
    template_row = complete_rows.iloc[0]

    for column in columns_to_keep:
        missing_labels = tract_rows.index[tract_rows[column].isna()]
        if len(missing_labels) > 0:
            gdf_quarter.loc[missing_labels, column] = template_row[column]

In [60]:
# Rows: 1332 tracts x 10 years x 4 quarters = 53280.
# Columns: GEOID, quarter, year, deaths + the 12 columns in columns_to_keep = 16.
gdf_quarter.shape #should be (53280, 16)


(53280, 16)

In [62]:
# Distinct tract count after squaring the frame; should be 1332.
gdf_quarter['GEOID'].nunique(dropna=True)

1332

In [63]:
# Confirm that every tract has a row for every year (2014-2023) and quarter (1-4);
# prints one line per missing combination and nothing when the frame is complete.
#
# The existing (GEOID, year, quarter) keys are collected into a set once and each
# combination is tested by membership, instead of building three boolean masks
# over the whole frame for every tract/year/quarter combination.
years = range(2014, 2024)
quarters = range(1, 5)

present_keys = set(zip(gdf_quarter['GEOID'], gdf_quarter['year'], gdf_quarter['quarter']))

for geoid in gdf_quarter['GEOID'].unique():
    for year in years:
        for quarter in quarters:
            if (geoid, year, quarter) not in present_keys:
                print("Missing row for GEOID {} in year {} quarter {}".format(geoid, year, quarter))



In [64]:
# Report whether any cell anywhere in the frame is missing.
has_na = gdf_quarter.isna().any(axis=None)

print(
    "There are missing values in the dataframe."
    if has_na
    else "There are no missing values in the dataframe."
)


There are no missing values in the dataframe.


In [65]:
# Any duplicates?
# The squared frame must hold exactly one row per (GEOID, year, quarter).
# Comparing whole rows would miss two rows for the same tract and period that
# differ in any other column (for example deaths or geometry), so duplicates are
# looked for on the key columns. Any whole-row duplicate is still caught, because
# it necessarily repeats the key.
has_duplicates = gdf_quarter.duplicated(subset=['GEOID', 'year', 'quarter']).any()

if has_duplicates:
    print("There are duplicate rows in the dataframe.")
else:
    print("There are no duplicate rows in the dataframe.")


There are no duplicate rows in the dataframe.


In [66]:
# Change the format of the period columns.

# year: cast to int, render as text, parse as a '%Y' date and keep only the year number.
gdf_quarter['year'] = pd.to_datetime(
    gdf_quarter['year'].astype(int).astype(str),
    format='%Y',
).dt.year

# quarter: render as text, then map the float-style labels '1.0'..'4.0' to bare
# digits; any other value is left as it is.
gdf_quarter['quarter'] = gdf_quarter['quarter'].astype(str).replace(
    {'1.0': '1', '2.0': '2', '3.0': '3', '4.0': '4'}
)


### Store the squared quarterly DataFrame (`gdf_quarter`)

In [68]:
# Persist gdf_quarter with IPython's %store magic so it can be restored later with `%store -r gdf_quarter`.
%store gdf_quarter

Stored 'gdf_quarter' (DataFrame)
