In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import fidap
from config import api_key

# instantiate api connection
fidap = fidap.fidap_client(api_key=api_key)

### Social Determinants of Health  

This document aims to create a Minimum Viable Product that documents the constituent components of Social Determinants of Health (SDOH). SDOH is broadly conceptualized as comprised of measurements of:  
1. Crime Levels  
2. Educational Attainment  
3. Retail-Grocery Gap  
4. Environmental Factors  
5. Personal Infrastructure  
6. Climate Change  
7. Family  

What we are trying to achieve here is to approximate what the open-data that are used to derive SDOH might look like.  
  
#### Minimum Viable Product (MVP)  
  
This MVP proposed will likely be able to take into account all of the 6 of the 7 factors listed above. Climate Change and Environmental Factors will be amalgamated into one. For the purpose of this MVP, we will primarily gather data on the spatial scale of zip codes, relating to Chicago IL in 2018. [Chicago's zip codes](https://www.chicago.gov/content/dam/city/sites/covid/reports/2020-04-24/ChicagoCommunityAreaandZipcodeMap.pdf) also have the distinctive feature of starting with 606. In fact all zip-codes starting with 606 relate to Chicago.   
  
A lot of the data can be obtained at the zip-code level. Detail and data at the scale of census blocks might not be easily available all the time. It is therefore much preferable to aggregate at the scale of zip codes. Zip codes also have the benefit of being universally understood, as opposed to rather esoteric FIPs codes of Census blocks or Census tracts. However there are multiple problems with using zip codes, but we can address them in future iterations of this product. For the moment, the combined use of ZCTAs from the Census Bureau and Zip Codes are stable enough to provide an indicative MVP.   
  
The common identifier that can be used to connect all datasets will be zip code.  

#### Chicago's Zip Codes  
  
So the idea here is to get the boundary of each zip code in Chicago, IL, as well as the area of each zip code in square miles.   

In [2]:
chicago_zip_query = fidap.sql("""
SELECT zip_code, zip_code_geom, (ROUND(ST_AREA(zip_code_geom)/2589988.1103,3)) AS zip_area_sqm
FROM bigquery-public-data.geo_us_boundaries.zip_codes
WHERE zip_code LIKE '606%'
""")

In [3]:
chicago_zip_query.head(n = 5)

Unnamed: 0,zip_code,zip_code_geom,zip_area_sqm
0,60628,"POLYGON((-87.64419 41.683236, -87.644347 41.68...",10.979
1,60652,"POLYGON((-87.742055 41.771129, -87.741973 41.7...",5.007
2,60611,"POLYGON((-87.628628 41.90318, -87.628358 41.90...",1.403
3,60606,"POLYGON((-87.641378 41.888632, -87.641374 41.8...",0.255
4,60624,"POLYGON((-87.741203 41.895093, -87.741186 41.8...",3.535


#### Family  
  
We define family broadly as basic demographic information such as the breakdown of the population by race, age group, gender, and household information.

In [4]:
age_structure_query = fidap.sql("""
SELECT geo_id AS zip, total_pop,male_pop,female_pop, median_age, male_under_5,male_5_to_9,male_10_to_14, (male_15_to_17+male_18_to_19) AS male_15_to_19, (male_20+male_21+male_22_to_24) AS male_20_to_24 ,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,(male_60_to_61+male_62_to_64) AS male_60_to_64, (male_65_to_66+male_67_to_69+male_70_to_74+male_75_to_79+male_80_to_84+male_85_and_over) AS male_65_and_over,female_under_5,female_5_to_9,female_10_to_14,(female_15_to_17+female_18_to_19) AS female_15_to_19, (female_20 + female_21 + female_22_to_24) AS female_20_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,(female_60_to_61+female_62_to_64) AS female_60_to_64,(female_65_to_66+female_67_to_69+female_70_to_74+female_75_to_79+female_80_to_84+female_85_and_over) AS female_65_and_over 
FROM bigquery-public-data.census_bureau_acs.zcta5_2018_5yr
WHERE geo_id LIKE '606%';
""")

In [5]:
age_structure_query.head(n = 5)

Unnamed: 0,zip,total_pop,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,male_15_to_19,male_20_to_24,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,male_60_to_64,male_65_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_19,female_20_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_64,female_65_and_over
0,60604,782,369,413,32.4,6,0,7,35,23,43,64,6,9,11,23,32,54,56,6,0,0,17,145,59,30,4,15,16,24,22,38,37
1,60602,1244,693,551,30.6,5,43,44,7,64,158,117,151,47,0,47,0,5,5,56,0,0,11,27,169,139,55,42,46,6,0,0,0
2,60601,14675,7191,7484,34.9,228,45,0,176,348,1607,1141,608,452,442,802,196,422,724,322,111,0,731,561,1297,818,225,762,534,207,128,437,1351
3,60656,27579,13992,13587,40.5,756,732,811,519,546,1315,1164,1246,1014,976,1049,1027,972,1865,941,545,488,427,661,896,1340,1184,802,831,746,983,971,2772
4,60622,52793,27216,25577,31.7,1858,1059,797,637,1492,5567,5169,2824,2187,1326,1171,862,714,1553,1682,946,703,966,1828,5186,4621,2838,1609,1141,903,847,741,1566


The `age_structure_query` looks at the demographic structure of each zip code. Each zip code is broken down into its total population, total male and female populations, median age, as well as the population for each 5-year age bracket, with an upper limit of 65 and separated by sex. If what we want is a standardized dataset, then perhaps we can express all of the numbers as a percentage of each zip code's total. 

In [16]:
age_structure_pct = age_structure_query.copy().assign(
    males_pop_pct = lambda x: round(100*(x.male_pop/x.total_pop),2),
    females_pop_pct = lambda x: round(100*(x.female_pop/x.total_pop),2),
    males_under_15_pct = lambda x: round(100*((x.male_under_5+x.male_5_to_9+x.male_10_to_14)/x.total_pop),2),
    males_above_65_pct = lambda x: round(100*(x.male_65_and_over/x.total_pop),2),
    males_15_19_pct = lambda x: round(100*(x.male_15_to_19/x.total_pop),2),
    males_20_24_pct = lambda x: round(100*(x.male_20_to_24/x.total_pop),2),
    males_25_29_pct = lambda x: round(100*(x.male_25_to_29/x.total_pop),2),
    males_30_34_pct = lambda x: round(100*(x.male_30_to_34/x.total_pop),2),
    males_35_39_pct = lambda x: round(100*(x.male_35_to_39/x.total_pop),2),
    males_40_44_pct = lambda x: round(100*(x.male_40_to_44/x.total_pop),2),
    males_45_49_pct = lambda x: round(100*(x.male_45_to_49/x.total_pop),2),
    males_55_59_pct = lambda x: round(100*(x.male_55_to_59/x.total_pop),2),
    males_60_64_pct = lambda x: round(100*(x.male_60_to_64/x.total_pop),2),
    females_under_15_pct = lambda x: round(100*((x.female_under_5+x.female_5_to_9+x.female_10_to_14)/x.total_pop),2),
    females_above_65_pct = lambda x: round(100*(x.female_65_and_over/x.total_pop),2),
    females_15_19_pct = lambda x: round(100*(x.female_15_to_19/x.total_pop),2),
    females_20_24_pct = lambda x: round(100*(x.female_20_to_24/x.total_pop),2),
    females_25_29_pct = lambda x: round(100*(x.female_25_to_29/x.total_pop),2),
    females_30_34_pct = lambda x: round(100*(x.female_30_to_34/x.total_pop),2),
    females_35_39_pct = lambda x: round(100*(x.female_35_to_39/x.total_pop),2),
    females_40_44_pct = lambda x: round(100*(x.female_40_to_44/x.total_pop),2),
    females_45_49_pct = lambda x: round(100*(x.female_45_to_49/x.total_pop),2),
    females_55_59_pct = lambda x: round(100*(x.female_55_to_59/x.total_pop),2),
    females_60_64_pct = lambda x: round(100*(x.female_60_to_64/x.total_pop),2)
)

age_structure_pct_cols = ["zip","total_pop","male_pop","female_pop","median_age",'males_pop_pct', 'females_pop_pct', 'males_under_15_pct',
       'males_above_65_pct', 'males_15_19_pct', 'males_20_24_pct',
       'males_25_29_pct', 'males_30_34_pct', 'males_35_39_pct',
       'males_40_44_pct', 'males_45_49_pct', 'males_55_59_pct',
       'males_60_64_pct', 'females_under_15_pct', 'females_above_65_pct',
       'females_15_19_pct', 'females_20_24_pct', 'females_25_29_pct',
       'females_30_34_pct', 'females_35_39_pct', 'females_40_44_pct',
       'females_45_49_pct', 'females_55_59_pct', 'females_60_64_pct']
age_structure_pct = age_structure_pct.loc[:,age_structure_pct_cols]

In [18]:
age_structure_pct.head(n=5)

Unnamed: 0,zip,total_pop,male_pop,female_pop,median_age,males_pop_pct,females_pop_pct,males_under_15_pct,males_above_65_pct,males_15_19_pct,males_20_24_pct,males_25_29_pct,males_30_34_pct,males_35_39_pct,males_40_44_pct,males_45_49_pct,males_55_59_pct,males_60_64_pct,females_under_15_pct,females_above_65_pct,females_15_19_pct,females_20_24_pct,females_25_29_pct,females_30_34_pct,females_35_39_pct,females_40_44_pct,females_45_49_pct,females_55_59_pct,females_60_64_pct
0,60604,782,369,413,32.4,47.19,52.81,1.66,7.16,4.48,2.94,5.5,8.18,0.77,1.15,1.41,4.09,6.91,0.77,4.73,2.17,18.54,7.54,3.84,0.51,1.92,2.05,2.81,4.86
1,60602,1244,693,551,30.6,55.71,44.29,7.4,0.4,0.56,5.14,12.7,9.41,12.14,3.78,0.0,0.0,0.4,4.5,0.0,0.88,2.17,13.59,11.17,4.42,3.38,3.7,0.0,0.0
2,60601,14675,7191,7484,34.9,49.0,51.0,1.86,4.93,1.2,2.37,10.95,7.78,4.14,3.08,3.01,1.34,2.88,2.95,9.21,4.98,3.82,8.84,5.57,1.53,5.19,3.64,0.87,2.98
3,60656,27579,13992,13587,40.5,50.73,49.27,8.34,6.76,1.88,1.98,4.77,4.22,4.52,3.68,3.54,3.72,3.52,7.16,10.05,1.55,2.4,3.25,4.86,4.29,2.91,3.01,3.56,3.52
4,60622,52793,27216,25577,31.7,51.55,48.45,7.04,2.94,1.21,2.83,10.54,9.79,5.35,4.14,2.51,1.63,1.35,6.31,2.97,1.83,3.46,9.82,8.75,5.38,3.05,2.16,1.6,1.4


In [19]:
family_structure_query = fidap.sql("""
SELECT geo_id AS zip, households, married_households, (households-married_households) AS unmarried_households,
    nonfamily_households, family_households, (family_households - married_households) AS family_unmarried_households, 
    households_public_asst_or_food_stamps
FROM bigquery-public-data.census_bureau_acs.zcta5_2018_5yr
WHERE geo_id LIKE '606%';
""")

In [20]:
family_structure_query.head(n=5)

Unnamed: 0,zip,households,married_households,unmarried_households,nonfamily_households,family_households,family_unmarried_households,households_public_asst_or_food_stamps
0,60604,479,145,334,318,161,16,0
1,60602,602,420,182,182,420,0,0
2,60601,8864,2974,5890,5764,3100,126,49
3,60656,12088,5803,6285,4732,7356,1553,1058
4,60622,23410,6683,16727,13974,9436,2753,2288


The `family_structure_query` looks at the structure of households in each zip code in terms of the number of households.  Of the total number of households, how many of them are married, and unmarried.  
  
There is also another way to look at the households, which is by family unit. That is, the number of households that are family and non-family. Married households is a subset of family households; in other words, we can also derive the number of unmarried family households by subtracting the number of married households from the number of family households.  
  
The third dimension of looking at household structure is through a socio-economic lens - the number of households that require public assistance and/or on food stamps. This can be used as a proxy for the prevalence of poverty. 

In [22]:
race_query = fidap.sql("""
SELECT geo_id AS zip, total_pop, black_pop, asian_pop, hispanic_pop, amerindian_pop, other_race_pop, white_pop
FROM bigquery-public-data.census_bureau_acs.zcta5_2018_5yr
WHERE geo_id LIKE '606%';
""")

In [23]:
race_query.head(n=5)

Unnamed: 0,zip,total_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,white_pop
0,60604,782,37,232,34,0,0,479
1,60602,1244,47,242,81,65,0,788
2,60601,14675,767,2641,1274,66,0,9677
3,60656,27579,325,2013,3382,0,148,21053
4,60622,52793,3230,2253,11579,32,139,34144


In [24]:
race_query_pct = race_query.assign(
    black_pct = lambda z: round(100*z.black_pop/z.total_pop,2),
    asian_pct = lambda z: round(100*z.asian_pop/z.total_pop,2),
    hispanic_pct = lambda z: round(100*z.hispanic_pop/z.total_pop,2),
    amerindian_pct = lambda z: round(100*z.amerindian_pop/z.total_pop,2),
    other_race_pct = lambda z: round(100*z.other_race_pop/z.total_pop,2),
    white_pct = lambda z: round(100*z.white_pop/z.total_pop,2),
)
race_query_pct = race_query_pct.loc[:, ['zip', 'total_pop','black_pct', 'asian_pct', 'hispanic_pct', 'amerindian_pct', 'other_race_pct', 'white_pct']]

In [25]:
race_query_pct.head(n = 5)

Unnamed: 0,zip,total_pop,black_pct,asian_pct,hispanic_pct,amerindian_pct,other_race_pct,white_pct
0,60604,782,4.73,29.67,4.35,0.0,0.0,61.25
1,60602,1244,3.78,19.45,6.51,5.23,0.0,63.34
2,60601,14675,5.23,18.0,8.68,0.45,0.0,65.94
3,60656,27579,1.18,7.3,12.26,0.0,0.54,76.34
4,60622,52793,6.12,4.27,21.93,0.06,0.26,64.68


Finally, we can also look at the concept of family through each zip code's racial breakdown, which we showcase here in `race_query` and `race_query_pct`, the latter of which shows very clearly the persistence of ghettos in Chicago. 

#### Education  
  
Education can be defined in terms of educational attainment of the population.  
  
Counting the number of educational establishments within each zip code is another way to do this, but it does not directly affect the population in its surrounding areas as they might not make use of them. Not everyone who lives around UChicago enjoys the benefit of a UChicago education. But is obviously more true at other levels of education such as K-12 as most children attend schools near their place of residence. Then, the availability of educational opportunities matter.    

In [26]:
educational_attainment_query = fidap.sql("""
SELECT geo_id AS zip, total_pop, pop_25_years_over,
    high_school_diploma, less_one_year_college, some_college_and_associates_degree, 
    associates_degree, bachelors_degree,
    masters_degree, graduate_professional_degree 
FROM bigquery-public-data.census_bureau_acs.zcta5_2018_5yr
WHERE geo_id LIKE '606%';
""")

educational_attainment_query.head(n=5)

Unnamed: 0,zip,total_pop,pop_25_years_over,high_school_diploma,less_one_year_college,some_college_and_associates_degree,associates_degree,bachelors_degree,masters_degree,graduate_professional_degree
0,60604,782,543,18,0,52,35,142,240,331
1,60602,1244,987,0,5,51,0,578,150,358
2,60601,14675,12153,320,406,1681,379,4690,3627,5428
3,60656,27579,21153,4806,1122,5896,1562,5668,1952,2421
4,60622,52793,40825,3352,950,5877,1493,17957,7025,10292


The `educational_attainment_query` looks at the number of people in each zip code and their highest educational attainment. 

#### Retail Grocery Gap  
  
What we want to measure here is the availability of fresh food. We want to see whether the distribution of supermarkets in each zip code is equitable. To this end, we will first like to obtain a list of supermarkets in Chicago, IL, and then group them by zip code. 

In [31]:
supermarket_query = fidap.sql("""
WITH bounding_area AS (SELECT geometry FROM bigquery-public-data.geo_openstreetmap.planet_features_multipolygons
WHERE ('name:en', 'Chicago') IN (SELECT(key, value) FROM UNNEST(all_tags))
AND ('boundary', 'administrative') IN (SELECT(key, value) FROM UNNEST(all_tags))
AND ('admin_level', '8') IN  (SELECT(key, value) FROM UNNEST(all_tags))
)
SELECT pt.geometry, tags.value AS tags, tags.key AS keys
FROM bigquery-public-data.geo_openstreetmap.planet_features_points AS pt, bounding_area
JOIN UNNEST(all_tags) AS tags
WHERE (tags.key = 'name' OR tags.key = 'addr:postcode')
AND ('shop', 'supermarket') IN (SELECT(key, value) FROM UNNEST(all_tags))
AND ST_WITHIN(pt.geometry, bounding_area.geometry)
""")

In [33]:
# pivoting the table  
supermarket_df = supermarket_query.pivot(index = 'geometry', columns = 'keys', values = 'tags').reset_index()
supermarket_df = supermarket_df.rename(columns = {'addr:postcode' : 'zip_code'})

# counting the number of supermarkets by zip code
supermarket_df.name = supermarket_df.name.fillna('Unknown')
supermarket_zip_df = supermarket_df.groupby('zip_code').agg('count').drop('geometry', axis = 1).reset_index()
supermarket_zip_df.zip_code = supermarket_zip_df.zip_code.astype(int)

# left join the number of supermarkets to number of zip codes
# provide us with the number of zip codes without a supermarket
supermarket_zip_gdf = chicago_zip_query.merge(supermarket_zip_df, 'left', 'zip_code')
supermarket_zip_gdf.name = supermarket_zip_gdf.name.fillna(0)
supermarket_zip_gdf = supermarket_zip_gdf.rename(columns = {'name' : 'count'})

# look at the per-capita availability of supermarkets 
pop_zip_code = race_query.loc[:, ['zip', 'total_pop']]
pop_zip_code = pop_zip_code.assign(
    pop_10ks = pop_zip_code.total_pop/10000
)
supermarket_zip_gdf = supermarket_zip_gdf.merge(pop_zip_code, left_on = 'zip_code', right_on = 'zip')
supermarket_zip_gdf = supermarket_zip_gdf.assign(
    per_capita_supermarket = lambda x: x['count']/x['pop_10ks']
)

# calling head on the dataset
supermarket_zip_gdf.head(n=5)

Unnamed: 0,zip_code,zip_code_geom,zip_area_sqm,count,zip,total_pop,pop_10ks,per_capita_supermarket
0,60628,"POLYGON((-87.64419 41.683236, -87.644347 41.68...",10.979,0.0,60628,66724,6.6724,0.0
1,60652,"POLYGON((-87.742055 41.771129, -87.741973 41.7...",5.007,0.0,60652,43907,4.3907,0.0
2,60611,"POLYGON((-87.628628 41.90318, -87.628358 41.90...",1.403,3.0,60611,32426,3.2426,0.925183
3,60606,"POLYGON((-87.641378 41.888632, -87.641374 41.8...",0.255,0.0,60606,3101,0.3101,0.0
4,60624,"POLYGON((-87.741203 41.895093, -87.741186 41.8...",3.535,0.0,60624,36158,3.6158,0.0


With this query, we are able to obtain the zip codes that do not have a supermarket while counting the number of supermarkets per 10000 inhabitants in the zip codes that do have a supermarket. 

#### Crime Levels  
  
To account for the impact of crime, I pulled data from Chicago's crime database corresponding to the year 2018 because our ACS data dates from then.   

In [29]:
crime_query = fidap.sql("""
WITH chicago_zip AS(
SELECT zip_code, zip_code_geom 
FROM bigquery-public-data.geo_us_boundaries.zip_codes
WHERE zip_code LIKE '606%'
)

SELECT COUNT(*) AS num_cases, z.zip_code 
FROM bigquery-public-data.chicago_crime.crime AS c, chicago_zip AS z
WHERE c.year = 2018
AND c.location IS NOT NULL
AND ST_CONTAINS(z.zip_code_geom, ST_GEOGPOINT(c.longitude, c.latitude))
GROUP BY z.zip_code;
""")

In [30]:
crime_query.head(n=5)

Unnamed: 0,num_cases,zip_code
0,4115,60640
1,6453,60609
2,9996,60619
3,784,60646
4,7713,60649


#### Personal Infrastructure  
  
Our definition of personal infrastructure refers to the quality of the housing stock, as well as public transit availability.   

In [37]:
housing_stock_query = fidap.sql("""
SELECT acs.geo_id AS zip_code, acs.median_year_structure_built, acs.percent_income_spent_on_rent, (acs.total_pop/acs.housing_units) AS housing_density
FROM bigquery-public-data.census_bureau_acs.zcta5_2018_5yr AS acs
WHERE acs.geo_id LIKE '606%'
""")

# calling head to inspect query
housing_stock_query.head(n=5)

Unnamed: 0,zip_code,median_year_structure_built,percent_income_spent_on_rent,housing_density
0,60604,0,19.3,1.305509
1,60602,2006,30.6,1.907975
2,60601,1990,27.2,1.347814
3,60656,1962,27.3,2.150409
4,60622,0,24.0,2.049497


Here, we look at the number of quality of buildings in a zip code as approximated by the median age of the structure. 

In [36]:
personal_transportation_query = fidap.sql("""
WITH chicago_zip AS(
SELECT zip_code, zip_code_geom 
FROM bigquery-public-data.geo_us_boundaries.zip_codes
WHERE zip_code LIKE '606%'
)

SELECT COUNT(*) AS count_transit_stops, z.zip_code
FROM bigquery-public-data.geo_openstreetmap.planet_features_points AS p, chicago_zip as z
JOIN UNNEST(all_tags) AS tags
WHERE tags.key = 'name'
AND ('operator', 'Chicago Transit Authority') IN (SELECT(key, value) FROM UNNEST(all_tags))
AND ST_CONTAINS(z.zip_code_geom, p.geometry)
GROUP BY z.zip_code;
""")

personal_transportation_query.head(n=5)

Unnamed: 0,count_transit_stops,zip_code
0,144,60626
1,260,60638
2,171,60610
3,164,60605
4,533,60637


Alternatively, we can also look at the location of public transit stops, unpack that number by counting the number of such stops in each zip code.

#### Climate and Environment  
  
In terms of the climate and the environment, we can look at it from the perspective of air quality (PM2.5) obtained from the EPA. 

In [39]:
# query for pm2.5 values in Chicago, IL
air_quality_query = fidap.sql("""
WITH epa_chicago AS(
SELECT parameter_name, arithmetic_mean AS mean_pm25_value, sample_duration, 
  STRING(date_local) AS obs_date,
  ST_GEOGPOINT(longitude, latitude) AS measuring_stn_geom,
  address
from bigquery-public-data.epa_historical_air_quality.pm25_nonfrm_daily_summary
WHERE state_name = 'Illinois' 
AND city_name = 'Chicago'
AND STRING(date_local) LIKE '2018%'
AND sample_duration LIKE '24%')
SELECT *,
CASE WHEN address LIKE '%LAWNDALE%' THEN 'lawndale' ELSE 'springfield' END AS stn
FROM epa_chicago;
""")

In [44]:
poor_air_quality_df = air_quality_query.loc[air_quality_query.mean_pm25_value>=12.0,] 
# number of unique days where air quality was poor by EPA standards
print('In 2018, across the 2 measuring stations in Chicago, IL, there were a total of ' + str(poor_air_quality_df.obs_date.nunique()) + ' days with poor air quality.')

# number of poor air quality days by station
poor_air_quality_stn_df = poor_air_quality_df.groupby('stn').agg('count').reset_index()
poor_air_quality_stn_df = poor_air_quality_stn_df.loc[:,['stn', 'address']].rename(columns = {'address':'n_days'})

In 2018, across the 2 measuring stations in Chicago, IL, there were a total of 110 days with poor air quality.


In [50]:
# query each zip code's distance from each measuring station 
distance_stn_zip = fidap.sql("""
SELECT ST_DISTANCE(zip_code_geom, ST_GEOGPOINT(-87.722673, 41.912739))/1600 AS springfield,
ST_DISTANCE(zip_code_geom, ST_GEOGPOINT(-87.713488, 41.7514))/1600 AS lawndale, 
zip_code
FROM bigquery-public-data.geo_us_boundaries.zip_codes
WHERE zip_code LIKE '606%'
""")

In [None]:
# number of poor air quality days by zip code
distance_stn_zip = distance_stn_zip.assign(
    closest = np.where(distance_stn_zip['lawndale'] < distance_stn_zip['springfield'],
                      'lawndale','springfield')
)
distance_stn_zip = distance_stn_zip.merge(poor_air_quality_stn_df, left_on = 'closest', right_on = 'stn')

poor_quality_by_zip_closest_stn = distance_stn_zip.loc[:,['zip_code', 'closest', 'n_days']]


poor_quality_by_zip_closest_stn.head(n=5)

Finally, we can also count the number of severe storms within the county, or obtain the centroid of each incident, then group and count by zip code.

In [47]:
severe_storms_query = fidap.sql("""
SELECT event_type, event_id, event_begin_time, event_end_time, damage_property, deaths_direct, injuries_direct, deaths_indirect, injuries_indirect, event_longitude, event_latitude, event_range
FROM bigquery-public-data.noaa_historic_severe_storms.storms_2018
WHERE state_fips_code = '17'
AND cz_fips_code = '31'
""")

In [48]:
severe_storms_query_zip_code = fidap.sql("""
WITH chicago_zip AS(
SELECT zip_code, zip_code_geom 
FROM bigquery-public-data.geo_us_boundaries.zip_codes
WHERE zip_code LIKE '606%'
)
SELECT COUNT(*) AS severe_storm_count, z.zip_code
FROM bigquery-public-data.noaa_historic_severe_storms.storms_2018 AS s, chicago_zip as z
WHERE s.state_fips_code = '17'
AND s.cz_fips_code = '31'
AND ST_CONTAINS(z.zip_code_geom, ST_GEOGPOINT(s.event_longitude, s.event_latitude))
GROUP BY z.zip_code
""")

severe_storms_query_zip_code.head(n=5)

Unnamed: 0,severe_storm_count,zip_code
0,1,60634
1,1,60626
2,1,60641
3,1,60613
4,1,60657
