## Retrieve all sample data for the given state and county codes.

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint
import requests
import json
import copy
from typing import List
from urllib.request import urlretrieve
from urllib.parse import quote
import csv

# Import API key
from config import key
from config import email_key

Universal Params. Uses your own email_key and key. Set begin and end date.

In [2]:
# Universal request parameters. Uses your own email_key and key (imported from
# config). Set the begin and end date (YYYYMMDD) here.
params_air = {
    'email': email_key,
    'key': key,
    # AQS documents multiple parameter codes as a single comma-separated value.
    # A tuple would be sent by requests as repeated `param=` keys instead.
    'param': '88101,88502',
    'bdate': '20190101',
    'edate': '20191201',
}

In [3]:
# Container for the result dataframes, one entry per state/county request.
# Re-running this cell rebinds the name to a fresh, empty dict.
dict_state_county_df: dict = {}

In [4]:
# Function: takes a list of (state code, county code, state name) tuples.
# For each tuple, a dataframe is created and stored in a dict. The values in the dataframe
# are selected fields from the JSON API data. There is one dataframe per state/county pair.

# Beginning of function.
def StateCountySampleDataDF(
    state_counties: List[tuple], params_air: dict, dict_state_county_df: dict) -> None:
    """Request AQS sample data for each state/county and print a summary.

    Args:
        state_counties: List of (state code, county code, state name) tuples,
            e.g. ("06", "037", "California").
        params_air: Base query parameters forwarded to the API request.
        dict_state_county_df: Dict handed to ``try_fill_dataframe`` to collect
            the resulting dataframes; its keys are listed in the summary.

    Returns:
        None. The outcome is reported by printing the keys of
        ``dict_state_county_df``.
    """
    # Column headers of each dataframe; also the keys copied from every record.
    data_to_gather_list = [
        "state_code",
        "state",
        "county_code",
        "county",
        "site_number",
        "parameter",
        "sample_measurement",
        "units_of_measure",
        "latitude",
        "longitude",
        "date_local",
        "time_local",
        "date_gmt",
        "time_gmt",
        "method_type",
        "method_code",
        "method",
        "date_of_last_change",
        "cbsa_code",
    ]
    # API URL: sample data by county from the AQS service at aqs.epa.gov.
    base_url = "https://aqs.epa.gov/data/api/sampleData/byCounty?"

    # Validate each tuple, call the API and build the dataframes.
    try_fill_dataframe(state_counties, data_to_gather_list, params_air, base_url, dict_state_county_df)

    # print() returns None, so report first and fall off the end explicitly
    # rather than `return print(...)`.
    print(f'''
    Data gathering completed. The following dataframes were created:
    {dict_state_county_df.keys()}
                 ''')

from typing import List, Dict, Tuple
# One request target: (state code, county code, state name),
# e.g. ("06", "037", "California"). Tuple[str] alone would mean a 1-tuple.
state_county_names = Tuple[str, str, str]

# Verify: Verifies correct format is passed through main function. Tuple with 3 parameters is expected.
def try_fill_dataframe(
    state_counties: List[state_county_names],
    data_to_gather_list: List[str],
    params_air: Dict[str, str],
    base_url: str,
    state_county_dataframes: Dict[str, pd.DataFrame]) -> None:
    """Validate each state/county/name tuple, then fetch and store its data.

    Args:
        state_counties: Tuples of (state code, county code, state name).
        data_to_gather_list: Record keys to copy; also the dataframe columns.
        params_air: Base query parameters for the API request.
        base_url: Endpoint to query.
        state_county_dataframes: Dict that receives one dataframe per tuple.

    A ValueError or TypeError raised for any entry is reported with a printed
    message. Because the ``try`` wraps the whole loop, entries after the
    failing one are not processed.
    """
    try:
        for sc in state_counties:
            # Verifies parameter input is in the expected format.
            validate_state_and_county(sc)
            # Store into the dict this function was given, not a module-level
            # global that happens to share the caller's name.
            get_results_and_fill_dataframe(
                sc, params_air, base_url, data_to_gather_list, state_county_dataframes
            )
    except (ValueError, TypeError):
        print("Parameters do not match. Needs int,int,string as parameters")

# Validates the tuple being passed. The two codes must be convertible to int. The last string is the state name.
def validate_state_and_county(sc: state_county_names) -> None:
    """Check that ``sc`` has the form (state code, county code, state name).

    Raises:
        ValueError: If ``sc`` does not have exactly three elements, or a code
            is a string that ``int`` cannot parse.
        TypeError: If ``sc`` has no length, or a code is of a type ``int``
            does not accept.
    """
    # sc example: ("06", "037", "California")
    # Reject wrong-sized input here, with an exception type the caller
    # handles, instead of failing later when the state name is indexed.
    if len(sc) != 3:
        raise ValueError(
            f"Expected (state code, county code, state name), got {sc!r}"
        )
    # int() raises on its own for non-numeric codes; no try/except is needed
    # just to re-raise.
    int(sc[0])
    int(sc[1])

# Main call to get results and import values into their own dataframe.
def get_results_and_fill_dataframe(
    sc: tuple, params_air, base_url, data_to_gather_list, dict_state_county_df
):
    """Fetch the records for one state/county and store them as a dataframe.

    ``sc`` is read positionally as (state code, county code, state name).
    The dataframe is stored under a key built from those three values.
    """
    state_code = sc[0]
    county_code = sc[1]
    state_label = sc[2]
    # Dict key: state name, then state code, then county code.
    frame_key = f"df_{state_label}_{state_code}_{county_code}"
    # API call: returns the records to load.
    records = get_response(params_air, state_code, county_code, base_url)
    create_empty_df_then_fill_in_loop(
        records,
        data_to_gather_list,
        dict_state_county_df,
        frame_key,
    )

# Get response: Params settings, request for json, Filter to Data dict only.
def get_response(params_air: dict, state: str, county: str, base_url: str):
    """Request data for one state/county and return the response's "Data" entry.

    Args:
        params_air: Base query parameters; left unmodified.
        state: Value sent as the ``state`` query parameter.
        county: Value sent as the ``county`` query parameter.
        base_url: Endpoint to query.

    Returns:
        The value stored under "Data" in the decoded JSON response.

    Raises:
        KeyError: If the decoded response has no "Data" key.
    """
    # Build a per-request copy so the caller's shared dict is not left holding
    # the state/county of whichever request ran last.
    query = {**params_air, "state": state, "county": county}
    response = requests.get(base_url, params=query).json()
    return response["Data"]

# Beginning of collecting all data into dataframe. Builds empty dataframe, adds headers, values by keyname
def create_empty_df_then_fill_in_loop(
    results_use, data_to_gather_list, dict_state_county_df, keyname
):
    """Store a header-only dataframe under ``keyname``, then load the records.

    The columns are the labels in ``data_to_gather_list``; the rows are added
    afterwards by ``while_loop_over_results``.
    """
    # A deep copy of a fresh, empty frame becomes the dict entry.
    dict_state_county_df[keyname] = copy.deepcopy(
        pd.DataFrame(columns=data_to_gather_list)
    )
    # Copy the selected JSON fields into that entry, record by record.
    while_loop_over_results(
        results_use,
        data_to_gather_list,
        keyname,
        dict_state_county_df,
    )

# Loop over requested json. Get len of results, gather only requested keys
def while_loop_over_results(
    results_use, data_to_gather_list, keyname, dict_state_county_df
):
    """Hand every record position in ``results_use`` to ``try_fill_out_data2``.

    Positions run from 0 to ``len(results_use) - 1``.
    """
    # To test with only a few records, shrink this range.
    for row_index in range(len(results_use)):
        try_fill_out_data2(
            data_to_gather_list, row_index, keyname, dict_state_county_df, results_use
        )

# Verify: Verifying input from requested json data. Only select keys we have chosen. If not, except error.
def try_fill_out_data2(
    data_to_gather_list, x, keyname, dict_state_county_df, results_use
):
    """Add record ``x`` of ``results_use`` as row ``x`` of the ``keyname`` dataframe.

    Best effort: any failure is reported with a printed message and the
    record is skipped, so the caller can continue with the next one.
    """
    try:
        # data2 holds the selected fields of this record.
        data2 = fill_data2(data_to_gather_list, x, results_use)
        # The record's position doubles as the row label.
        dict_state_county_df[keyname].loc[x] = data2
    except Exception as e:
        # Deliberately broad so one bad record does not stop the run, but say
        # which row failed and why so the problem can be diagnosed.
        print(
            "Couldnt find value.. Moving on. DataFrame may be empty "
            f"({keyname}, row {x}: {e!r})"
        )
# Import json data into dataframes. Main data we are requesting is saved here.
def fill_data2(data_to_gather_list, x, results_use) -> dict:
    """Return the requested fields of record ``x`` as a ``{label: value}`` dict.

    Labels are looked up in ``results_use[x]`` in the order given; a failed
    lookup propagates to the caller.
    """
    return {label: results_use[x][label] for label in data_to_gather_list}
    
# Function call
# ex: StateCountySampleDataDF(state_counties, params_air, dict_state_county_df)
# Each entry is a (state code, county code, state name) tuple; the list is
# processed entry by entry.
state_counties = [
    ("06", "037", "California"),
    ("04", "007", "Arizona"),
]
StateCountySampleDataDF(
    state_counties=state_counties,
    params_air=params_air,
    dict_state_county_df=dict_state_county_df,
)


    Data gathering completed. The following dataframes were created:
    dict_keys(['df_California_06_037', 'df_Arizona_04_007'])
                 


### Finished.
The data is saved to CSV below, after verifying it. There is no need to run the API again unless you are updating the data.

## Request data in DataFrame

List all dataframes created.

In [5]:
# Show the key of every dataframe stored in the dict (displayed as the cell output).
dict_state_county_df.keys()

dict_keys(['df_California_06_037', 'df_Arizona_04_007'])

California Sample

In [6]:
# To reload a previously saved copy instead of calling the API again:
# pd.read_csv("california_data.csv")

# Open the California dataset by its key and preview the first rows.
california_df = dict_state_county_df['df_California_06_037']
california_df.head()

Unnamed: 0,state_code,state,county_code,county,site_number,parameter,sample_measurement,units_of_measure,latitude,longitude,date_local,time_local,date_gmt,time_gmt,method_type,method_code,method,date_of_last_change,cbsa_code
0,6,California,37,Los Angeles,9034,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),34.813034,-118.884819,2019-06-26,00:00,2019-06-26,08:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-04-15,31080
1,6,California,37,Los Angeles,9034,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),34.813034,-118.884819,2019-04-30,00:00,2019-04-30,08:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,31080
2,6,California,37,Los Angeles,9034,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),34.813034,-118.884819,2019-04-27,00:00,2019-04-27,08:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,31080
3,6,California,37,Los Angeles,9034,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),34.813034,-118.884819,2019-04-24,00:00,2019-04-24,08:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,31080
4,6,California,37,Los Angeles,9034,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),34.813034,-118.884819,2019-04-03,00:00,2019-04-03,08:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,31080


In [7]:
# Write the California data to disk; the row index is left out of the file.
california_df.to_csv(path_or_buf='./california_data.csv', index=False)

Arizona Sample

In [8]:
# To reload a previously saved copy instead of calling the API again:
# pd.read_csv("arizona_data.csv")

# Open the Arizona dataset by its key and preview the first rows.
arizona_df = dict_state_county_df['df_Arizona_04_007']
arizona_df.head()

Unnamed: 0,state_code,state,county_code,county,site_number,parameter,sample_measurement,units_of_measure,latitude,longitude,date_local,time_local,date_gmt,time_gmt,method_type,method_code,method,date_of_last_change,cbsa_code
0,4,Arizona,7,Gila,10,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),33.6547,-111.1074,2019-06-29,00:00,2019-06-29,07:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-04-15,37740
1,4,Arizona,7,Gila,10,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),33.6547,-111.1074,2019-06-26,00:00,2019-06-26,07:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-04-15,37740
2,4,Arizona,7,Gila,10,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),33.6547,-111.1074,2019-04-30,00:00,2019-04-30,07:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,37740
3,4,Arizona,7,Gila,10,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),33.6547,-111.1074,2019-04-27,00:00,2019-04-27,07:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,37740
4,4,Arizona,7,Gila,10,Acceptable PM2.5 AQI & Speciation Mass,,Micrograms/cubic meter (LC),33.6547,-111.1074,2019-04-24,00:00,2019-04-24,07:00,Non-FRM,707,IMPROVE Module A with Cyclone Inlet-Teflon Fil...,2020-03-04,37740


In [9]:
# Write the Arizona data to disk; the row index is left out of the file.
arizona_df.to_csv(path_or_buf='./arizona_data.csv', index=False)