In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import requests
from urllib.parse import urlencode
from yarl import URL
from time import sleep

In [2]:
lbw = pd.read_csv('data/Natality_by_year_2007-2021.txt', sep="\t")
lbw.columns = [col.lower().replace(' ', '_') for col in lbw.columns]

In [3]:
lbw.drop(columns='notes').dropna().sort_values(by='year', ascending=False)

Unnamed: 0,state,state_code,county,county_code,infant_birth_weight_12,infant_birth_weight_12_code,year,year_code,births,%_of_total_births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
21213,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2021.0,2021.0,230.0,0.01%,2303.27,36.19,36.30
11144,Missouri,29.0,"St. Charles County, MO",29183.0,2000 - 2499 grams,5.0,2021.0,2021.0,94.0,0.00%,2324.00,36.36,35.80
11120,Missouri,29.0,"St. Charles County, MO",29183.0,1000 - 1499 grams,3.0,2021.0,2021.0,20.0,0.00%,1277.05,30.30,30.05
1955,California,6.0,"San Diego County, CA",6073.0,1000 - 1499 grams,3.0,2021.0,2021.0,143.0,0.01%,1277.10,30.43,30.19
11108,Missouri,29.0,"St. Charles County, MO",29183.0,500 - 999 grams,2.0,2021.0,2021.0,10.0,0.00%,753.10,30.00,25.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7788,Iowa,19.0,"Unidentified Counties, IA",19999.0,500 - 999 grams,2.0,2007.0,2007.0,48.0,0.00%,785.85,26.13,25.72
15709,Ohio,39.0,"Warren County, OH",39165.0,1500 - 1999 grams,4.0,2007.0,2007.0,19.0,0.00%,1774.05,33.26,32.89
19108,Texas,48.0,"Travis County, TX",48453.0,500 - 999 grams,2.0,2007.0,2007.0,71.0,0.00%,759.96,25.44,25.31
15704,Ohio,39.0,"Warren County, OH",39165.0,1000 - 1499 grams,3.0,2007.0,2007.0,15.0,0.00%,1188.47,31.20,29.80


In [None]:
data_path = Path('data/')
aqi_files = [file for file in data_path.glob('*aqi*') if file.is_file()]
aqi_files.sort()
aqi = pd.read_csv(aqi_files[0])
for file in aqi_files[1:]:
    aqi = pd.concat([aqi, pd.read_csv(file)])
aqi.columns = [col.lower().replace(' ', '_') for col in aqi.columns]
aqi.to_csv('data/aqi_by_year_2006-2021.csv', index=False)

In [None]:
aqi.sort_values(by='days_with_aqi').sample(50)

In [None]:
aqi['year'].value_counts()

## Getting data from the AQS API

In [3]:
email = f'winston.degraw@gmail.com'
key = 'carmelswift48'

In [4]:
def get_state_codes(email, key):
    '''
    Queries AQS to get the list of state codes
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/states'
    params = {
        'email' : email,
        'key'   : key  
    }
    
    url = URL('?'.join([base_url, urlencode(params)]), encoded=False)
    res = requests.get(url)

    if res.status_code == 200:
        posts = pd.DataFrame(res.json()['Data'])
        return posts
    else:
        print(f'status: {res.status_code}')
    return None

#### Only need to query the state list once, so storing it to a variable to loop over.

In [5]:
state_codes = get_state_codes(email,key)
state_codes

Unnamed: 0,code,value_represented
0,01,Alabama
1,02,Alaska
2,04,Arizona
3,05,Arkansas
4,06,California
5,08,Colorado
6,09,Connecticut
7,10,Delaware
8,11,District Of Columbia
9,12,Florida


In [6]:
def get_county_codes(email, key, state):
    '''
    Queries AQS to get the list of county codes for the given state
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/countiesByState'
    params = {
        'email' : email,
        'key'   : key  ,
        'state' : state
    }
    
    url = URL('?'.join([base_url, urlencode(params)]), encoded=False)
    res = requests.get(url)

    if res.status_code == 200:
        posts = pd.DataFrame(res.json()['Data'])
        return posts['code']
    else:
        print(f'status: {res.status_code}')
    return None

In [7]:
def get_param_class_codes(email, key):
    '''
    Queries AQS to get the codes representing the classes of measured air pollutants
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/classes'
    params = {
        'email' : email,
        'key'   : key  
    }
    
    url = URL('?'.join([base_url, urlencode(params)]), encoded=False)
    res = requests.get(url)

    if res.status_code == 200:
        posts = pd.DataFrame(res.json()['Data'])
        return posts
    else:
        print(f'status: {res.status_code}')
    return None

In [8]:
get_param_class_codes(email,key)

Unnamed: 0,code,value_represented
0,AIRNOW MAPS,The parameters represented on AirNow maps (881...
1,ALL,Select all Parameters Available
2,AQI POLLUTANTS,Pollutants that have an AQI Defined
3,CORE_HAPS,Urban Air Toxic Pollutants
4,CRITERIA,Criteria Pollutants
5,CSN DART,List of CSN speciation parameters to populate ...
6,FORECAST,Parameters routinely extracted by AirNow (STI)
7,HAPS,Hazardous Air Pollutants
8,IMPROVE CARBON,IMPROVE Carbon Parameters
9,IMPROVE_SPECIATION,PM2.5 Speciated Parameters Measured at IMPROVE...


#### The `CRITERIA` class seems to be the suggested parameter class, so going to use that (`AQI POLLUTANTS` contains basically the same info, but with some aggregate stats as well that we don't want)

In [9]:
def get_param_codes_by_class(email, key, pc):
    '''
    Queries AQS to get the codes representing the parameters in parameter class (pc)
    '''
    base_url = 'https://aqs.epa.gov/data/api/list/parametersByClass'
    params = {
        'email' : email,
        'key'   : key  ,
        'pc'    : pc
    }
    
    url = URL('?'.join([base_url, urlencode(params)]), encoded=True)
    res = requests.get(url)

    if res.status_code == 200:
        posts = pd.DataFrame(res.json()['Data'])
        return posts['code']
    else:
        print(f'status: {res.status_code}')
    return None

#### Also only need to get this once, so storing it to a variable

In [10]:
param_codes = get_param_codes_by_class(email, key, 'CRITERIA').astype(str)
params_1 = ','.join(param_codes[:4].values)
params_2 = ','.join(param_codes[4:].values)
params = [params_1, params_2]

In [11]:
def aqs_api_annual_county(email, key, param, bdate, edate, state, county):
    '''
    Query the AQS API to get the annual summary data for the given parameter, state, county
    bdate and edate have to be in the same year for the request to work
    '''
    base_url = 'https://aqs.epa.gov/data/api/annualData/byCounty'
    params = {
        'email' : email,
        'key'   : key  ,
        'param' : param,
        'bdate' : bdate,
        'edate' : edate,
        'state' : state,
        'county': county
    }
    
    url = URL('?'.join([base_url, urlencode(params)]), encoded=False)
    res = requests.get(url)
    
    if res.status_code == 200:
        posts = pd.DataFrame(res.json()['Data'])
        return posts
    else:
        print(f'status: {res.status_code}')
        print(res.json())
    return None

### Now going to try to loop over all states, for each state loop over each year, for each year loop over each parameter, and for each parameter loop over each county in the state to gather the data for that county. Will have to query AQS for each state to get the county codes.

In [12]:
# put in beginning year, end year + 1
years = np.arange(2006, 2007).astype(str)
years

array(['2006'], dtype='<U21')

In [14]:
state_codes

Unnamed: 0,code,value_represented
0,01,Alabama
1,02,Alaska
2,04,Arizona
3,05,Arkansas
4,06,California
5,08,Colorado
6,09,Connecticut
7,10,Delaware
8,11,District Of Columbia
9,12,Florida


In [20]:
state_codes.iloc[state_codes[state_codes['value_represented']=='California'].index[0]:]

Unnamed: 0,code,value_represented
4,06,California
5,08,Colorado
6,09,Connecticut
7,10,Delaware
8,11,District Of Columbia
9,12,Florida
10,13,Georgia
11,15,Hawaii
12,16,Idaho
13,17,Illinois


In [96]:
data_list = []
for state_code, state in state_codes.values[:1]:
# Query for county codes at the top to avoid repeated queries
    county_codes = get_county_codes(email,key,state_code)
    for year in years:
        print('--------------------')
        print(f'Collecting data for state {state} in year {year}')
        print('--------------------')
        for county in county_codes:
            for param in params:
                data_list.append( aqs_api_annual_county(email, key, param, year+'0101', year+'1231', state_code, county) )

--------------------
Collecting data for state Alabama in year 2006
--------------------


In [98]:
data = pd.concat(data_list, ignore_index=True)
data

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration_code,...,fiftieth_percentile,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,01,003,0010,44201,1,30.497478,-87.880258,NAD83,Ozone,1,...,0.056,0.035,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,19300,"Daphne-Fairhope-Foley, AL",2021-12-21
1,01,003,0010,44201,1,30.497478,-87.880258,NAD83,Ozone,W,...,0.050,0.031,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,19300,"Daphne-Fairhope-Foley, AL",2021-12-21
2,01,003,0010,44201,1,30.497478,-87.880258,NAD83,Ozone,W,...,0.050,0.031,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,19300,"Daphne-Fairhope-Foley, AL",2021-12-21
3,01,003,0010,88101,1,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,7,...,10.400,5.000,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,19300,"Daphne-Fairhope-Foley, AL",2023-01-09
4,01,003,0010,88101,1,30.497478,-87.880258,NAD83,PM2.5 - Local Conditions,7,...,10.400,5.000,"FAIRHOPE, Alabama","FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE...",Alabama,Baldwin,Fairhope,19300,"Daphne-Fairhope-Foley, AL",2023-01-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,01,127,0002,88101,1,33.832885,-87.272505,NAD83,PM2.5 - Local Conditions,7,...,13.100,5.700,JASPER,HIGHLAND AVE. AND 17TH ST. EAST,Alabama,Walker,Jasper,13820,"Birmingham-Hoover, AL",2021-11-09
392,01,127,0002,88101,1,33.832885,-87.272505,NAD83,PM2.5 - Local Conditions,7,...,13.100,5.700,JASPER,HIGHLAND AVE. AND 17TH ST. EAST,Alabama,Walker,Jasper,13820,"Birmingham-Hoover, AL",2021-11-09
393,01,127,0002,88101,1,33.832885,-87.272505,NAD83,PM2.5 - Local Conditions,7,...,13.100,5.700,JASPER,HIGHLAND AVE. AND 17TH ST. EAST,Alabama,Walker,Jasper,13820,"Birmingham-Hoover, AL",2021-11-09
394,01,127,0002,88101,1,33.832885,-87.272505,NAD83,PM2.5 - Local Conditions,7,...,13.100,5.700,JASPER,HIGHLAND AVE. AND 17TH ST. EAST,Alabama,Walker,Jasper,13820,"Birmingham-Hoover, AL",2021-11-09


In [99]:
data.to_csv('data/AQS_data_test_Alabama_2006.csv', index=False)

In [25]:
data_list=['blah']
start_state='arizona'
start_state_index = state_codes[state_codes['value_represented'].str.lower()\
                                == start_state.lower()].index[0]

print(start_state_index)

2


In [24]:
state_count = 0

# loop over all desired states, years, counties, and parameters. 
# not including Canada, Mexico, and start from whichever state was input (if any)
for state_code, state in state_codes.values[:-2]:
    state_count += 1
    
    # check if there is state data already, if so then write it to a csv
    if len(data_list)>0:
        print(state_codes["value_represented"].iloc[state_count-1])
        data = pd.concat(data_list, ignore_index=True)
        if beginning_year == end_year:
            data.to_csv(f'data/AQS_county_data_{state_codes["value_represented"].iloc[state_count-1]}_{beginning_year}.csv', index=False)
        else:
            data.to_csv(f'data/AQS_county_data_{state_codes["value_represented"].iloc[state_count-1]}_{beginning_year}_{end_year}.csv', index=False)
    
    # make sure data_list is empty before filling it up again
    data_list=[]
    
    # Query for county codes at the top to avoid repeated queries
    county_codes = get_county_codes(email,key,state_code)
    for year in years:
        print('--------------------')
        print(f'Collecting data for state {state} in year {year} ({state_count}/{len(state_codes[:-2])})')
        print('--------------------')
        county_count = 0
        for county in county_codes:
            county_count += 1
            print(f'County {county_count}/{len(county_codes)}')
            for param in params:
                data_list.append( aqs_api_annual_county(email, key, param, year+'0101', year+'1231', state_code, county) )
                # Have to sleep for API to be happy
                sleep(2)

Alabama


TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

## Time to see what kind of cleaning to expect from the AQS data

In [107]:
aqs = pd.read_csv('data/AQS_data_test_Alabama_2006.csv')

In [108]:
aqs['parameter_code'].unique()

array([44201, 88101, 42401, 81102, 42101])

#### Many, many useless columns, going to start with deleting these
#### Begin with everything up to longitude

In [109]:
aqs.columns

Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'sample_duration_code',
       'sample_duration', 'pollutant_standard', 'metric_used', 'method',
       'year', 'units_of_measure', 'event_type', 'observation_count',
       'observation_percent', 'validity_indicator', 'valid_day_count',
       'required_day_count', 'exceptional_data_count',
       'null_observation_count', 'primary_exceedance_count',
       'secondary_exceedance_count', 'certification_indicator',
       'arithmetic_mean', 'standard_deviation', 'first_max_value',
       'first_max_datetime', 'second_max_value', 'second_max_datetime',
       'third_max_value', 'third_max_datetime', 'fourth_max_value',
       'fourth_max_datetime', 'first_max_nonoverlap_value',
       'first_max_n_o_datetime', 'second_max_nonoverlap_value',
       'second_max_n_o_datetime', 'ninety_ninth_percentile',
       'ninety_eighth_percentile', 'ninety_fifth_percentile'

In [110]:
columns_to_drop = list(aqs.columns[:7])
aqs.drop(aqs.columns[:7], axis=1, inplace=True)

In [111]:
columns_to_drop

['state_code',
 'county_code',
 'site_number',
 'parameter_code',
 'poc',
 'latitude',
 'longitude']

In [117]:
aqs['sample_duration_code'].unique()

array(['1', 'W', '7', 'X', 'Y', 'Z'], dtype=object)

In [118]:
aqs['county'].unique()

array(['Baldwin', 'Clay', 'Colbert', 'DeKalb', 'Elmore', 'Escambia',
       'Etowah', 'Houston', 'Jackson', 'Jefferson', 'Lawrence', 'Madison',
       'Marengo', 'Mobile', 'Montgomery', 'Morgan', 'Pike', 'Russell',
       'Shelby', 'Sumter', 'Talladega', 'Tuscaloosa', 'Walker'],
      dtype=object)

In [119]:
aqs[aqs['sample_duration_code']=='X']

Unnamed: 0,datum,parameter,sample_duration_code,sample_duration,pollutant_standard,metric_used,method,year,units_of_measure,event_type,...,fiftieth_percentile,tenth_percentile,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
22,WGS84,Sulfur dioxide,X,24-HR BLK AVG,SO2 24-hour 1971,Daily Average of observed values,,2006,Parts per billion,No Events,...,1.1,1.0,,TVA COLBERT 14___3.98 MI SE COLBERT FP,Alabama,Colbert,,22520.0,"Florence-Muscle Shoals, AL",2021-11-09
79,WGS84,Sulfur dioxide,X,24-HR BLK AVG,SO2 24-hour 1971,Daily Average of observed values,,2006,Parts per billion,No Events,...,1.4,1.0,,TVA WIDOWS CRK 11 2.0 MI ESE WIDOWS CRK,Alabama,Jackson,Not in a city,42460.0,"Scottsboro, AL",2021-11-09
89,WGS84,Sulfur dioxide,X,24-HR BLK AVG,SO2 24-hour 1971,Daily Average of observed values,,2006,Parts per billion,No Events,...,1.9,0.5,Fairfield,"FAIRFIELD, PFD, 5229 COURT B",Alabama,Jefferson,Fairfield,13820.0,"Birmingham-Hoover, AL",2021-11-09
128,WGS84,PM10 Total 0-10um STP,X,24-HR BLK AVG,PM10 24-hour 2006,Daily Mean,,2006,Micrograms/cubic meter (25 C),No Events,...,33.0,14.0,Sloss Shuttlesworth,4113 SHUTTLESWORTH DRIVE,Alabama,Jefferson,Birmingham,13820.0,"Birmingham-Hoover, AL",2021-11-08
130,WGS84,PM10 Total 0-10um STP,X,24-HR BLK AVG,PM10 24-hour 2006,Daily Mean,,2006,Micrograms/cubic meter (25 C),No Events,...,31.0,10.0,,44-SWEET AVENUE,Alabama,Jefferson,Tarrant (corporate name for Tarrant City),13820.0,"Birmingham-Hoover, AL",2021-11-08
133,WGS84,PM10 Total 0-10um STP,X,24-HR BLK AVG,PM10 24-hour 2006,Daily Mean,,2006,Micrograms/cubic meter (25 C),No Events,...,25.0,10.0,Wylam,1242 JERSEY ST WYLAM AL,Alabama,Jefferson,Birmingham,13820.0,"Birmingham-Hoover, AL",2021-11-08
140,WGS84,PM10 Total 0-10um STP,X,24-HR BLK AVG,PM10 24-hour 2006,Daily Mean,,2006,Micrograms/cubic meter (25 C),No Events,...,28.0,11.0,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,Birmingham,13820.0,"Birmingham-Hoover, AL",2021-11-08
259,NAD83,Sulfur dioxide,X,24-HR BLK AVG,SO2 24-hour 1971,Daily Average of observed values,,2006,Parts per billion,No Events,...,0.4,0.0,BAY ROAD,"BAY RD. ,MOBILE AL.",Alabama,Mobile,Theodore,33660.0,"Mobile, AL",2021-11-09
280,NAD83,PM10 Total 0-10um STP,X,24-HR BLK AVG,PM10 24-hour 2006,Daily Mean,,2006,Micrograms/cubic meter (25 C),No Events,...,21.0,12.0,CHICKASAW,"Iroquois and Azalea, CHICKASAW, MOBILE CO., A...",Alabama,Mobile,Chickasaw,33660.0,"Mobile, AL",2021-11-08


#### `state` and `county` are what we will ultimately want to combine the data from the CDC on, so `city`, `cbsa*` are unecessary 

In [59]:
aqs.drop(aqs.columns[-4:],axis=1, inplace=True)

In [60]:
aqs.drop(['local_site_name','site_address'], axis=1, inplace=True)

In [61]:
aqs.columns

Index(['datum', 'parameter', 'sample_duration_code', 'sample_duration',
       'pollutant_standard', 'metric_used', 'method', 'year',
       'units_of_measure', 'event_type', 'observation_count',
       'observation_percent', 'validity_indicator', 'valid_day_count',
       'required_day_count', 'exceptional_data_count',
       'null_observation_count', 'primary_exceedance_count',
       'secondary_exceedance_count', 'certification_indicator',
       'arithmetic_mean', 'standard_deviation', 'first_max_value',
       'first_max_datetime', 'second_max_value', 'second_max_datetime',
       'third_max_value', 'third_max_datetime', 'fourth_max_value',
       'fourth_max_datetime', 'first_max_nonoverlap_value',
       'first_max_n_o_datetime', 'second_max_nonoverlap_value',
       'second_max_n_o_datetime', 'ninety_ninth_percentile',
       'ninety_eighth_percentile', 'ninety_fifth_percentile',
       'ninetieth_percentile', 'seventy_fifth_percentile',
       'fiftieth_percentile', 'tenth_per