In [405]:
import os
import warnings
from pathlib import Path
from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests
from yarl import URL

## Working on getting data from the AQS API

In [323]:
# AQS credentials are read from the environment so they are never stored in the notebook.
# Set AQS_EMAIL and AQS_KEY before starting the kernel.
# NOTE(review): a key was previously hardcoded in this cell; it should be regenerated
# because it remains visible in the file history.
email = os.environ.get('AQS_EMAIL', '')
key = os.environ.get('AQS_KEY', '')
if not (email and key):
    warnings.warn('AQS_EMAIL / AQS_KEY are not set; AQS API requests are likely to fail.')

In [287]:
# Endpoint listing the county codes of state 37.
# NOTE: several cells assign `url`; the request cells use whichever one ran last.
url = f'https://aqs.epa.gov/data/api/list/countiesByState?email={email}&key={key}&state=37'

In [73]:
# Endpoint listing all state codes (overwrites any previously assigned `url`).
url = f'https://aqs.epa.gov/data/api/list/states?email={email}&key={key}'

In [67]:
# Endpoint listing the parameter codes in the CRITERIA class (overwrites any previously assigned `url`).
url = f'https://aqs.epa.gov/data/api/list/parametersByClass?email={email}&key={key}&pc=CRITERIA'

In [288]:
# Send a GET request to whichever `url` was assigned most recently.
res = requests.get(url)

In [289]:
# HTTP status of the response; the helper functions in this notebook treat 200 as success.
res.status_code

200

In [290]:
# The result rows live under the 'Data' key of the JSON body.
res.json()['Data']

[{'code': '001', 'value_represented': 'Alamance'},
 {'code': '003', 'value_represented': 'Alexander'},
 {'code': '005', 'value_represented': 'Alleghany'},
 {'code': '007', 'value_represented': 'Anson'},
 {'code': '009', 'value_represented': 'Ashe'},
 {'code': '011', 'value_represented': 'Avery'},
 {'code': '013', 'value_represented': 'Beaufort'},
 {'code': '015', 'value_represented': 'Bertie'},
 {'code': '017', 'value_represented': 'Bladen'},
 {'code': '019', 'value_represented': 'Brunswick'},
 {'code': '021', 'value_represented': 'Buncombe'},
 {'code': '023', 'value_represented': 'Burke'},
 {'code': '025', 'value_represented': 'Cabarrus'},
 {'code': '027', 'value_represented': 'Caldwell'},
 {'code': '029', 'value_represented': 'Camden'},
 {'code': '031', 'value_represented': 'Carteret'},
 {'code': '033', 'value_represented': 'Caswell'},
 {'code': '035', 'value_represented': 'Catawba'},
 {'code': '037', 'value_represented': 'Chatham'},
 {'code': '039', 'value_represented': 'Cherokee'},

In [74]:
# Fetch the current `url` and keep the response under a states-specific name.
# NOTE(review): presumably run while `url` pointed at the list/states endpoint — confirm.
res_states = requests.get(url)

In [76]:
# Rows of the states response: one dict per state with 'code' and 'value_represented'.
res_states.json()['Data']

[{'code': '01', 'value_represented': 'Alabama'},
 {'code': '02', 'value_represented': 'Alaska'},
 {'code': '04', 'value_represented': 'Arizona'},
 {'code': '05', 'value_represented': 'Arkansas'},
 {'code': '06', 'value_represented': 'California'},
 {'code': '08', 'value_represented': 'Colorado'},
 {'code': '09', 'value_represented': 'Connecticut'},
 {'code': '10', 'value_represented': 'Delaware'},
 {'code': '11', 'value_represented': 'District Of Columbia'},
 {'code': '12', 'value_represented': 'Florida'},
 {'code': '13', 'value_represented': 'Georgia'},
 {'code': '15', 'value_represented': 'Hawaii'},
 {'code': '16', 'value_represented': 'Idaho'},
 {'code': '17', 'value_represented': 'Illinois'},
 {'code': '18', 'value_represented': 'Indiana'},
 {'code': '19', 'value_represented': 'Iowa'},
 {'code': '20', 'value_represented': 'Kansas'},
 {'code': '21', 'value_represented': 'Kentucky'},
 {'code': '22', 'value_represented': 'Louisiana'},
 {'code': '23', 'value_represented': 'Maine'},
 {'

In [78]:
# Fetch the current `url` and keep the response under a counties-specific name.
# NOTE(review): presumably run while `url` pointed at the list/countiesByState endpoint — confirm.
res_counties = requests.get(url)

In [228]:
# Rows of the counties response: one dict per county with 'code' and 'value_represented'.
res_counties.json()['Data']

[{'code': '001', 'value_represented': 'Alamance'},
 {'code': '003', 'value_represented': 'Alexander'},
 {'code': '005', 'value_represented': 'Alleghany'},
 {'code': '007', 'value_represented': 'Anson'},
 {'code': '009', 'value_represented': 'Ashe'},
 {'code': '011', 'value_represented': 'Avery'},
 {'code': '013', 'value_represented': 'Beaufort'},
 {'code': '015', 'value_represented': 'Bertie'},
 {'code': '017', 'value_represented': 'Bladen'},
 {'code': '019', 'value_represented': 'Brunswick'},
 {'code': '021', 'value_represented': 'Buncombe'},
 {'code': '023', 'value_represented': 'Burke'},
 {'code': '025', 'value_represented': 'Cabarrus'},
 {'code': '027', 'value_represented': 'Caldwell'},
 {'code': '029', 'value_represented': 'Camden'},
 {'code': '031', 'value_represented': 'Carteret'},
 {'code': '033', 'value_represented': 'Caswell'},
 {'code': '035', 'value_represented': 'Catawba'},
 {'code': '037', 'value_represented': 'Chatham'},
 {'code': '039', 'value_represented': 'Cherokee'},

In [346]:
def get_state_codes(email, key, timeout=60):
    '''
    Queries AQS to get the list of state codes

    Parameters
    ----------
    email : str
        Email address sent as the `email` query parameter.
    key : str
        API key sent as the `key` query parameter.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout (default 60), so a
        stalled connection raises instead of hanging the kernel.

    Returns
    -------
    pandas.Series or None
        The 'code' column of the response's 'Data' rows (an empty Series when
        'Data' is empty), or None when the HTTP status is not 200.
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/states'
    params = {
        'email' : email,
        'key'   : key
    }

    # Let requests build and percent-encode the query string itself.
    res = requests.get(base_url, params=params, timeout=timeout)

    if res.status_code != 200:
        print(f'status: {res.status_code}')
        return None

    posts = pd.DataFrame(res.json()['Data'])
    if posts.empty:
        # An empty frame has no 'code' column; return an empty Series rather than raising KeyError.
        return pd.Series([], dtype=object, name='code')
    return posts['code']

In [392]:
# State codes returned by AQS; the collection loop further down iterates over these.
state_codes = get_state_codes(email,key)

In [339]:
def get_county_codes(email, key, state, timeout=60):
    '''
    Queries AQS to get the list of county codes for the given state

    Parameters
    ----------
    email : str
        Email address sent as the `email` query parameter.
    key : str
        API key sent as the `key` query parameter.
    state : str
        State code sent as the `state` query parameter (e.g. '37').
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout (default 60), so a
        stalled connection raises instead of hanging the kernel.

    Returns
    -------
    pandas.Series or None
        The 'code' column of the response's 'Data' rows (an empty Series when
        'Data' is empty), or None when the HTTP status is not 200.
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/countiesByState'
    params = {
        'email' : email,
        'key'   : key,
        'state' : state
    }

    # Let requests build and percent-encode the query string itself.
    res = requests.get(base_url, params=params, timeout=timeout)

    if res.status_code != 200:
        print(f'status: {res.status_code}')
        return None

    posts = pd.DataFrame(res.json()['Data'])
    if posts.empty:
        # An empty frame has no 'code' column; return an empty Series rather than raising KeyError.
        return pd.Series([], dtype=object, name='code')
    return posts['code']

In [376]:
def get_param_class_codes(email, key, timeout=60):
    '''
    Queries AQS to get the codes representing the classes of measured air pollutants

    Parameters
    ----------
    email : str
        Email address sent as the `email` query parameter.
    key : str
        API key sent as the `key` query parameter.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout (default 60), so a
        stalled connection raises instead of hanging the kernel.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's 'Data' list (all columns kept),
        or None when the HTTP status is not 200.
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/classes'
    params = {
        'email' : email,
        'key'   : key
    }

    # Let requests build and percent-encode the query string itself.
    res = requests.get(base_url, params=params, timeout=timeout)

    if res.status_code != 200:
        print(f'status: {res.status_code}')
        return None

    return pd.DataFrame(res.json()['Data'])

In [None]:
# Display the available parameter classes (columns 'code' and 'value_represented').
get_param_class_codes(email,key)

Unnamed: 0,code,value_represented
0,AIRNOW MAPS,The parameters represented on AirNow maps (881...
1,ALL,Select all Parameters Available
2,AQI POLLUTANTS,Pollutants that have an AQI Defined
3,CORE_HAPS,Urban Air Toxic Pollutants
4,CRITERIA,Criteria Pollutants
5,CSN DART,List of CSN speciation parameters to populate ...
6,FORECAST,Parameters routinely extracted by AirNow (STI)
7,HAPS,Hazardous Air Pollutants
8,IMPROVE CARBON,IMPROVE Carbon Parameters
9,IMPROVE_SPECIATION,PM2.5 Speciated Parameters Measured at IMPROVE...


In [399]:
def get_param_codes_by_class(email, key, pc, timeout=60):
    '''
    Queries AQS to get the codes representing the parameters in parameter class (pc)

    Parameters
    ----------
    email : str
        Email address sent as the `email` query parameter.
    key : str
        API key sent as the `key` query parameter.
    pc : str
        Parameter class sent as the `pc` query parameter (e.g. 'CRITERIA').
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout (default 60), so a
        stalled connection raises instead of hanging the kernel.

    Returns
    -------
    pandas.Series or None
        The 'code' column of the response's 'Data' rows (an empty Series when
        'Data' is empty), or None when the HTTP status is not 200.
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/parametersByClass'
    params = {
        'email' : email,
        'key'   : key,
        'pc'    : pc
    }

    # Let requests build and percent-encode the query string itself.
    res = requests.get(base_url, params=params, timeout=timeout)

    if res.status_code != 200:
        print(f'status: {res.status_code}')
        return None

    posts = pd.DataFrame(res.json()['Data'])
    if posts.empty:
        # An empty frame has no 'code' column; return an empty Series rather than raising KeyError.
        return pd.Series([], dtype=object, name='code')
    return posts['code']

In [400]:
# Parameter codes in the CRITERIA class; the collection loop further down iterates over these.
param_codes = get_param_codes_by_class(email, key, 'CRITERIA')

In [353]:
def aqs_api_annual_county(email, key, param, bdate, edate, state, county, timeout=60):
    '''
    Query the AQS API to get the annual summary data for the given parameter, state, county
    bdate and edate have to be in the same year for the request to work

    Parameters
    ----------
    email : str
        Email address sent as the `email` query parameter.
    key : str
        API key sent as the `key` query parameter.
    param : str
        Parameter code(s) sent as the `param` query parameter (e.g. '88502').
    bdate, edate : str
        Begin and end dates in 'YYYYMMDD' form (e.g. '20100101', '20101231').
    state : str
        State code sent as the `state` query parameter.
    county : str
        County code sent as the `county` query parameter.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout (default 60), so a
        stalled connection raises instead of hanging the kernel.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's 'Data' list (empty when there are
        no entries), or None when the HTTP status is not 200.
    '''
    base_url = 'https://aqs.epa.gov/data/api/annualData/byCounty'
    params = {
        'email' : email,
        'key'   : key,
        'param' : param,
        'bdate' : bdate,
        'edate' : edate,
        'state' : state,
        'county': county
    }

    # Let requests build and percent-encode the query string itself.
    res = requests.get(base_url, params=params, timeout=timeout)

    if res.status_code != 200:
        print(f'status: {res.status_code}')
        return None

    return pd.DataFrame(res.json()['Data'])

In [414]:
# Smoke test: annual summary for parameter 88502 in state 39, county 001, over calendar year 2010.
data = aqs_api_annual_county(email, key, '88502', '20100101', '20101231', '39', '001')

In [415]:
# Display the frame returned by the smoke-test query.
data

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration_code,...,fiftieth_percentile,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,39,1,1,88502,3,38.79471,-83.53398,NAD83,Acceptable PM2.5 AQI & Speciation Mass,1,...,11.8,1.6,West Union,210 N. WILSON DR.,Ohio,Adams,West Union,,,2020-05-21
1,39,1,1,88502,3,38.79471,-83.53398,NAD83,Acceptable PM2.5 AQI & Speciation Mass,X,...,12.5,7.5,West Union,210 N. WILSON DR.,Ohio,Adams,West Union,,,2020-05-21


#### Next, loop over all states; for each state, loop over each year; for each year, loop over each parameter; and for each parameter, loop over each county in the state to gather that county's data. This requires querying AQS for the county codes of each state.

In [413]:
# Years 2005 through 2021 (the stop value 2022 is exclusive), converted to strings
# so that 'MMDD' suffixes can be appended when building bdate / edate.
years = np.arange(2005, 2022).astype(str)
years

array(['2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021'], dtype='<U21')

In [None]:
# Collect one annual-summary frame per (state, year, parameter, county) combination.
data_list = []
for state in state_codes:
    # County codes depend only on the state, so query them once per state
    # instead of once per (year, parameter) pair.
    county_codes = get_county_codes(email, key, state)
    if county_codes is None:
        # The lookup failed (its status was already printed); skip this state
        # rather than crashing when iterating over None.
        continue
    for year in years:
        print('--------------------')
        print(f'Collecting data for state {state} in year {year}')
        print('--------------------')
        for param in param_codes:
            for county in county_codes:
                # bdate and edate must fall in the same year, so query one calendar year at a time.
                result = aqs_api_annual_county(email, key, param, year+'0101', year+'1231', state, county)
                if result is not None:
                    # Failed requests return None; keep only real frames in the list.
                    data_list.append(result)

--------------------
Collecting data for state 01 in year 2005
--------------------
--------------------
Collecting data for state 01 in year 2006
--------------------


In [None]:
# Ad-hoc annual-data URL: parameters 88101 and 88502 for state 37, county 183,
# between 2016-01-01 and 2016-02-29. Adjacent literals are joined into one string.
url = (
    'https://aqs.epa.gov/data/api/annualData/byCounty?'
    f'email={email}&key={key}&param=88101,88502&bdate=20160101&edate=20160229&state=37&county=183'
)


In [22]:
# Load the tab-separated natality export and normalise its column names
# to lower-case with underscores in place of spaces.
lbw = (
    pd.read_csv('data/Natality_by_year_2007-2021.txt', sep="\t")
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)

In [29]:
# Preview only (result is not assigned): drop the 'notes' column, discard rows
# with any missing value, and show the most recent years first.
lbw.drop(columns='notes').dropna().sort_values(by='year', ascending=False)

Unnamed: 0,state,state_code,county,county_code,infant_birth_weight_12,infant_birth_weight_12_code,year,year_code,births,%_of_total_births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
21213,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2021.0,2021.0,230.0,0.01%,2303.27,36.19,36.30
11144,Missouri,29.0,"St. Charles County, MO",29183.0,2000 - 2499 grams,5.0,2021.0,2021.0,94.0,0.00%,2324.00,36.36,35.80
11120,Missouri,29.0,"St. Charles County, MO",29183.0,1000 - 1499 grams,3.0,2021.0,2021.0,20.0,0.00%,1277.05,30.30,30.05
1955,California,6.0,"San Diego County, CA",6073.0,1000 - 1499 grams,3.0,2021.0,2021.0,143.0,0.01%,1277.10,30.43,30.19
11108,Missouri,29.0,"St. Charles County, MO",29183.0,500 - 999 grams,2.0,2021.0,2021.0,10.0,0.00%,753.10,30.00,25.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7788,Iowa,19.0,"Unidentified Counties, IA",19999.0,500 - 999 grams,2.0,2007.0,2007.0,48.0,0.00%,785.85,26.13,25.72
15709,Ohio,39.0,"Warren County, OH",39165.0,1500 - 1999 grams,4.0,2007.0,2007.0,19.0,0.00%,1774.05,33.26,32.89
19108,Texas,48.0,"Travis County, TX",48453.0,500 - 999 grams,2.0,2007.0,2007.0,71.0,0.00%,759.96,25.44,25.31
15704,Ohio,39.0,"Warren County, OH",39165.0,1000 - 1499 grams,3.0,2007.0,2007.0,15.0,0.00%,1188.47,31.20,29.80


In [48]:
# Combine the per-file AQI CSVs in data/ into one frame and save it.
data_path = Path('data/')
aqi_out = data_path / 'aqi_by_year_2006-2021.csv'

# The combined output file also matches '*aqi*'; exclude it so that re-running
# this cell does not read the previous result back in as if it were an input.
aqi_files = sorted(
    file for file in data_path.glob('*aqi*')
    if file.is_file() and file != aqi_out
)
if not aqi_files:
    raise FileNotFoundError(f"no '*aqi*' input files found in {data_path}")

# Read every file, then concatenate once instead of growing the frame in a loop.
aqi = pd.concat([pd.read_csv(file) for file in aqi_files])
aqi.columns = [col.lower().replace(' ', '_') for col in aqi.columns]
aqi.to_csv(aqi_out, index=False)

In [49]:
# Show a random sample of 50 rows; the fixed seed keeps the displayed rows the same across re-runs.
# No sort is applied first, because sample() returns the rows in random order anyway.
aqi.sample(50, random_state=42)

Unnamed: 0,state,county,year,days_with_aqi,good_days,moderate_days,unhealthy_for_sensitive_groups_days,unhealthy_days,very_unhealthy_days,hazardous_days,max_aqi,90th_percentile_aqi,median_aqi,days_co,days_no2,days_ozone,days_pm2.5,days_pm10
798,Pennsylvania,Northampton,2011,365,165,189,9,2,0,0,152,84,53,0,9,89,264,3
395,Maryland,Harford,2019,359,261,89,9,0,0,0,133,71,44,0,0,231,128,0
81,California,Napa,2010,365,293,70,1,1,0,0,159,60,39,0,3,184,178,0
853,South Carolina,Greenville,2006,365,118,240,5,2,0,0,159,82,59,0,16,0,348,1
279,Indiana,Henry,2016,119,106,13,0,0,0,0,65,51,28,0,0,0,119,0
775,Pennsylvania,Luzerne,2012,366,329,35,2,0,0,0,129,51,33,0,0,348,0,18
262,Illinois,McHenry,2009,365,242,123,0,0,0,0,100,67,43,0,0,174,191,0
98,California,Ventura,2021,365,208,147,10,0,0,0,122,78,48,0,0,278,75,12
402,Maine,Knox,2007,179,151,22,4,2,0,0,177,71,34,0,0,179,0,0
300,Iowa,Pottawattamie,2021,124,100,23,1,0,0,0,117,57,33,0,0,0,120,4


In [408]:
# Number of rows per year in the combined AQI frame (ordered by count, not by year).
aqi['year'].value_counts()

2006    1091
2007    1082
2008    1077
2009    1077
2010    1076
2011    1071
2012    1050
2013    1044
2015    1042
2014    1036
2016    1030
2017    1029
2018    1021
2019    1020
2020    1003
2021    1002
Name: year, dtype: int64