# Data Collection and Cleaning

Data has been collected from both the EPA for Air Quality Index data and from the CDC for birth data relating to weight.  

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

## Data

### EPA AQI Data

The air quality data was downloaded from https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual as files containing annual data from years 2016-2021.  We are combining these CSV files into a single dataframe and will evaluate whether it will meet the needs for the project, or whether data will need to be collected through the API that the EPA offers for AirData.

In [100]:
# code from Winston merged with approach from
# https://towardsdatascience.com/pandas-concat-tricks-you-should-know-to-speed-up-your-data-analysis-cd3d4fdfe6dd

data_path = Path('data/')

# create a list of all the AQI csv files
dfs = (
    pd.read_csv(file) for file in data_path.glob('*aqi*') if file.is_file()
)

# concatenate the dataframes
res = pd.concat(dfs)

# export the final csv
res.to_csv('data/aqi_by_year_2006-2021.csv', index=False)

Read the final CSV for all the AQI data

In [101]:
aqi = pd.read_csv('data/aqi_by_year_2006-2021.csv')

In [102]:
aqi.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2009,252,218,32,2,0,0,0,136,53,36,0,0,200,52,0
1,Alabama,Clay,2009,119,97,22,0,0,0,0,94,59,33,0,0,0,119,0
2,Alabama,Colbert,2009,323,220,103,0,0,0,0,76,60,43,0,0,132,191,0
3,Alabama,DeKalb,2009,363,311,52,0,0,0,0,100,54,36,0,0,308,55,0
4,Alabama,Elmore,2009,244,228,16,0,0,0,0,80,49,36,0,0,244,0,0


In [103]:
aqi.columns

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days PM2.5', 'Days PM10'],
      dtype='object')

In [104]:
aqi.shape

(33502, 18)

In [105]:
aqi.dtypes

State                                  object
County                                 object
Year                                    int64
Days with AQI                           int64
Good Days                               int64
Moderate Days                           int64
Unhealthy for Sensitive Groups Days     int64
Unhealthy Days                          int64
Very Unhealthy Days                     int64
Hazardous Days                          int64
Max AQI                                 int64
90th Percentile AQI                     int64
Median AQI                              int64
Days CO                                 int64
Days NO2                                int64
Days Ozone                              int64
Days PM2.5                              int64
Days PM10                               int64
dtype: object

In [106]:
aqi.columns = [col.lower().replace(' ', '_') for col in aqi.columns]

In [107]:
aqi.columns

Index(['state', 'county', 'year', 'days_with_aqi', 'good_days',
       'moderate_days', 'unhealthy_for_sensitive_groups_days',
       'unhealthy_days', 'very_unhealthy_days', 'hazardous_days', 'max_aqi',
       '90th_percentile_aqi', 'median_aqi', 'days_co', 'days_no2',
       'days_ozone', 'days_pm2.5', 'days_pm10'],
      dtype='object')

In [108]:
aqi.isnull().sum()

state                                  0
county                                 0
year                                   0
days_with_aqi                          0
good_days                              0
moderate_days                          0
unhealthy_for_sensitive_groups_days    0
unhealthy_days                         0
very_unhealthy_days                    0
hazardous_days                         0
max_aqi                                0
90th_percentile_aqi                    0
median_aqi                             0
days_co                                0
days_no2                               0
days_ozone                             0
days_pm2.5                             0
days_pm10                              0
dtype: int64

In [109]:
aqi[['state', 'county']].nunique()

state      55
county    913
dtype: int64

In [110]:
aqi['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Country Of Mexico', 'Delaware',
       'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virgin Islands', 'Virginia', 'Washington', 'West Virginia',
       'Wisconsin', 'Wyoming', 'Canada'], dtype=object)

May want to drop locations such as:
* Country of Mexico
* Puerto Rico
* Virgin Islands
* Canada

In [111]:
aqi.describe()

Unnamed: 0,year,days_with_aqi,good_days,moderate_days,unhealthy_for_sensitive_groups_days,unhealthy_days,very_unhealthy_days,hazardous_days,max_aqi,90th_percentile_aqi,median_aqi,days_co,days_no2,days_ozone,days_pm2.5,days_pm10
count,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0
mean,2013.378843,304.81822,233.813563,64.906215,5.033132,0.920542,0.095994,0.048773,127.18375,61.495553,37.474599,1.425766,6.699361,169.352039,112.846815,14.494239
std,4.607776,91.250699,83.797024,51.877028,10.433271,4.05487,1.104159,0.679988,221.866188,19.566392,11.121959,15.505999,26.626984,119.843147,110.158979,53.624707
min,2006.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2009.0,248.0,176.0,25.0,0.0,0.0,0.0,0.0,89.0,50.0,33.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,360.0,249.0,53.0,1.0,0.0,0.0,0.0,112.0,59.0,39.0,0.0,0.0,185.0,93.0,0.0
75%,2017.0,365.0,304.0,93.0,6.0,0.0,0.0,0.0,145.0,71.0,44.0,0.0,0.0,245.0,183.0,1.0
max,2021.0,366.0,365.0,339.0,122.0,92.0,74.0,37.0,14043.0,306.0,132.0,365.0,365.0,366.0,366.0,366.0


### CDC Data

Want to import the CDC data and compare the counties.

In [120]:
cdc = pd.read_csv('data/Natality_by_year_2007-2021.txt', sep='\t')

In [121]:
cdc.head()

Unnamed: 0,Notes,State,State Code,County,County Code,Infant Birth Weight 12,Infant Birth Weight 12 Code,Year,Year Code,Births,% of Total Births,Average Birth Weight,Average LMP Gestational Age,Average OE Gestational Age
0,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2014.0,2014.0,10.0,0.00%,814.9,26.8,26.9
1,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2015.0,2015.0,15.0,0.00%,794.13,25.87,25.87
2,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2016.0,2016.0,10.0,0.00%,712.5,25.3,25.2
3,,Alabama,1.0,"Baldwin County, AL",1003.0,1000 - 1499 grams,3.0,2014.0,2014.0,10.0,0.00%,1323.7,28.9,29.2
4,,Alabama,1.0,"Baldwin County, AL",1003.0,1000 - 1499 grams,3.0,2015.0,2015.0,18.0,0.00%,1226.56,30.33,30.0


In [122]:
cdc.columns

Index(['Notes', 'State', 'State Code', 'County', 'County Code',
       'Infant Birth Weight 12', 'Infant Birth Weight 12 Code', 'Year',
       'Year Code', 'Births', '% of Total Births', 'Average Birth Weight',
       'Average LMP Gestational Age', 'Average OE Gestational Age'],
      dtype='object')

In [123]:
cdc.columns = [col.lower().replace(' ', '_') for col in cdc.columns]

Renaming the % of births column, and if we want different names for other columns, we can do it at this step

In [124]:
cdc.rename(columns={'%_of_total_births': 'pct_tot_births'
                   }, inplace=True)

In [125]:
cdc.columns

Index(['notes', 'state', 'state_code', 'county', 'county_code',
       'infant_birth_weight_12', 'infant_birth_weight_12_code', 'year',
       'year_code', 'births', 'pct_tot_births', 'average_birth_weight',
       'average_lmp_gestational_age', 'average_oe_gestational_age'],
      dtype='object')

In [126]:
cdc['notes'].nunique()

95

In [127]:
# cdc['notes'].unique()

All the notes look like they are what shows up at the bottom of the file and isn't data that we're looking for.  Dropping the notes column

In [128]:
cdc.drop(['notes'], axis=1, inplace=True)

In [129]:
cdc.isna().sum()[cdc.isna().sum() > 0]

state                          100
state_code                     100
county                         100
county_code                    100
infant_birth_weight_12         100
infant_birth_weight_12_code    100
year                           100
year_code                      100
births                         100
pct_tot_births                 100
average_birth_weight           100
average_lmp_gestational_age    100
average_oe_gestational_age     100
dtype: int64

In [130]:
cdc.tail()

Unnamed: 0,state,state_code,county,county_code,infant_birth_weight_12,infant_birth_weight_12_code,year,year_code,births,pct_tot_births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
21309,,,,,,,,,,,,,
21310,,,,,,,,,,,,,
21311,,,,,,,,,,,,,
21312,,,,,,,,,,,,,
21313,,,,,,,,,,,,,


We need state and county information for this project,and it looks like all these null values belong, possibly to rows that were holding that notes data.  Dropping these rows to see if that clears up all the nulls.

Drop the rows that have the footnotes in them

In [131]:
cdc.dropna(subset=['state'], inplace=True)

In [132]:
cdc.isna().sum()[cdc.isna().sum() > 0]

Series([], dtype: int64)

That did resolve all the null values.

In [133]:
cdc.tail(3)

Unnamed: 0,state,state_code,county,county_code,infant_birth_weight_12,infant_birth_weight_12_code,year,year_code,births,pct_tot_births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
21211,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2019.0,2019.0,234.0,0.01%,2324.8,36.53,36.47
21212,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2020.0,2020.0,232.0,0.01%,2302.12,36.78,36.43
21213,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2021.0,2021.0,230.0,0.01%,2303.27,36.19,36.3


It looks like year and year_code might contain the same data

In [134]:
(cdc['year'] == cdc['year_code']).sum() == cdc.shape[0]

True

`year` and `year_code` appear to be the same.  We also don't have state or county codes in the EPA data, so dropping those as well.

In [135]:
cdc.drop(['year_code', 'county_code', 'state_code'], axis=1, inplace=True)

In [136]:
cdc.columns

Index(['state', 'county', 'infant_birth_weight_12',
       'infant_birth_weight_12_code', 'year', 'births', 'pct_tot_births',
       'average_birth_weight', 'average_lmp_gestational_age',
       'average_oe_gestational_age'],
      dtype='object')

#### Gestational Age at Birth
Beginning in 2014 NCHS changed the standard for gestational period from the Last Menstrual Period (LMP) based gestational age to the Obstetric/clinical Estimate (OE) based gestational age. Obstetric/clinical Estimate (OE) based gestational age groups are available for years 2007 and later in WONDER since February 2016. Refer to [Measuring Gestational Age in Vital Statistics Data: Transitioning to the Obstetric Estimate](http://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_05.pdf) for more information.

I believe this means this means we should drop the LMP column and keep the OE column.

In [137]:
cdc.drop(['average_lmp_gestational_age'], axis=1, inplace=True)

In [138]:
cdc.columns

Index(['state', 'county', 'infant_birth_weight_12',
       'infant_birth_weight_12_code', 'year', 'births', 'pct_tot_births',
       'average_birth_weight', 'average_oe_gestational_age'],
      dtype='object')

#### Birth Weight Columns

There are two columns for the birth weight categories:
* `infant_birth_weight_12`
* `infant_brirth_weight_12_code`

So I will drop the code column and rename the remaining column to reduce length.  I'll also relabel the `average_birth_weight` column to reduce length and clean up the gestational age name as well.

In [139]:
cdc.drop(['infant_birth_weight_12_code'], axis=1, inplace=True)

In [140]:
cdc.rename(columns={
    'infant_birth_weight_12': 'weight_cat',
    'average_birth_weight': 'avg_birth_weight',
    'average_oe_gestational_age': 'ave_gest_age'
    }, inplace=True)

In [141]:
cdc.head()

Unnamed: 0,state,county,weight_cat,year,births,pct_tot_births,avg_birth_weight,ave_gest_age
0,Alabama,"Baldwin County, AL",500 - 999 grams,2014.0,10.0,0.00%,814.9,26.9
1,Alabama,"Baldwin County, AL",500 - 999 grams,2015.0,15.0,0.00%,794.13,25.87
2,Alabama,"Baldwin County, AL",500 - 999 grams,2016.0,10.0,0.00%,712.5,25.2
3,Alabama,"Baldwin County, AL",1000 - 1499 grams,2014.0,10.0,0.00%,1323.7,29.2
4,Alabama,"Baldwin County, AL",1000 - 1499 grams,2015.0,18.0,0.00%,1226.56,30.0


#### County Names

In [142]:
cdc['county'].sample(10)

10880    Unidentified Counties, MS
7753           Woodbury County, IA
21128    Unidentified Counties, WI
16130         Clackamas County, OR
3274               Mesa County, CO
16804            Lehigh County, PA
20123        Newport News city, VA
6688               Will County, IL
13637    Unidentified Counties, NY
6038             Canyon County, ID
Name: county, dtype: object

I ran the sample code above multiple times to see some of the data in the county column.  Looks like they are including the state abbreviation in the County name, so will remove that.  There were also some places that didn't say "County" and said "Counties" (i.e. Unidentified Counties, WY) and also there were some locations that were parishes (in LA) or other unique monikers like Borough.  So I'm splitting on both the " Count" to capture County and Counties as well as "," to get both cases.

We should look at what the AQI data has and decide whether we need to treat the remaining unique naming cases that remain after this clean up.

In [143]:
cdc['county'] = cdc['county'
                   ].apply(lambda x: x.split(' Count')[0].split(',')[0])

In [144]:
cdc[['state', 'county']].nunique()

state      50
county    466
dtype: int64

In [145]:
cdc['county'].sample(10)

9397       Washington
4141         Escambia
3429     Unidentified
5904     Unidentified
6088     Unidentified
1516         Monterey
18662         Midland
10136          Macomb
3977        Charlotte
19734    Unidentified
Name: county, dtype: object

This looks more like what I would expect where it is just the name of the county.  Since we also saw that there were unidentified counties, and they are all just listed as Unidentified now, we should drop that data since we are trying to investigate county air quality and county birth weights, so without a specific county, the data is not valuable to us.

In [146]:
cdc.shape

(21214, 8)

In [147]:
cdc.drop(cdc[cdc['county'] == 'Unidentified'].index, inplace=True)

In [148]:
cdc.shape

(19028, 8)

In [149]:
cdc.dtypes

state                object
county               object
weight_cat           object
year                float64
births              float64
pct_tot_births       object
avg_birth_weight    float64
ave_gest_age        float64
dtype: object

Most of the datatypes look correct, but the `pct_tot_births` is an object and I believe this should be a float, so will investigate that a bit further.

In [150]:
cdc['pct_tot_births'].value_counts()

0.00%    15428
0.01%     2551
0.02%      591
0.03%      191
0.05%       84
0.04%       70
0.06%       37
0.07%       25
0.08%       10
0.12%       10
0.13%        9
0.21%        3
0.15%        2
0.09%        2
0.11%        2
0.14%        2
0.20%        2
0.26%        2
0.18%        1
0.17%        1
0.19%        1
0.23%        1
0.10%        1
0.24%        1
0.22%        1
Name: pct_tot_births, dtype: int64

In [151]:
cdc['pct_tot_births'] = cdc['pct_tot_births'
                           ].apply(lambda x: x[:4])

In [160]:
cdc['pct_tot_births'] = cdc['pct_tot_births'].astype(float)

In [161]:
cdc.dtypes

state                object
county               object
weight_cat           object
year                float64
births              float64
pct_tot_births      float64
avg_birth_weight    float64
ave_gest_age        float64
dtype: object

In [162]:
cdc.head()

Unnamed: 0,state,county,weight_cat,year,births,pct_tot_births,avg_birth_weight,ave_gest_age
0,Alabama,Baldwin,500 - 999 grams,2014.0,10.0,0.0,814.9,26.9
1,Alabama,Baldwin,500 - 999 grams,2015.0,15.0,0.0,794.13,25.87
2,Alabama,Baldwin,500 - 999 grams,2016.0,10.0,0.0,712.5,25.2
3,Alabama,Baldwin,1000 - 1499 grams,2014.0,10.0,0.0,1323.7,29.2
4,Alabama,Baldwin,1000 - 1499 grams,2015.0,18.0,0.0,1226.56,30.0


### Export Data

In [163]:
cdc.to_csv('data/cdc_cleaned.csv', index=False)

## PARKING LOT

In [153]:
# how many counties from the aqi are also in the cdc data
# len(set(aqi['county']).intersection(set(cdc['County'])))

In [154]:
# what are the differences
#set(aqi['county']).difference(set(cdc['County']))

### EPA API Information

Look to see if getting more granular data from the API is feasible

Your user ID is your email address: mrhurless@gmail.com 
Your key is: rubymouse94 

In [155]:
# import requests

In [156]:
# email = 'mrhurless@gmail.com'
# key = 'rubymouse94'

# url = f'https://aqs.epa.gov/data/api/list/parametersByClass?email={email}&key={key}&pc=CRITERIA'

# res = requests.get(url)

# res.status_code

In [157]:
# # from https://stackoverflow.com/questions/71603314/ssl-error-unsafe-legacy-renegotiation-disabled

# import urllib3
# import ssl

# class CustomHttpAdapter (requests.adapters.HTTPAdapter):
#     # "Transport adapter" that allows us to use custom ssl_context.

#     def __init__(self, ssl_context=None, **kwargs):
#         self.ssl_context = ssl_context
#         super().__init__(**kwargs)

#     def init_poolmanager(self, connections, maxsize, block=False):
#         self.poolmanager = urllib3.poolmanager.PoolManager(
#             num_pools=connections, maxsize=maxsize,
#             block=block, ssl_context=self.ssl_context)


# def get_legacy_session():
#     ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
#     ctx.options |= 0x4  # OP_LEGACY_SERVER_CONNECT
#     session = requests.session()
#     session.mount('https://', CustomHttpAdapter(ctx))
#     return session

In [158]:
# res = get_legacy_session().get(url)

In [159]:
def get_aqi():
    creds = ('mhurless@me.com', '0n3L0v3#')
    email = 'mrhurless@gmail.com'
    key = 'rubymouse94'
    all_posts = []
    
    url = "https://support.brightsign.biz/api/v2/community/posts"
    
    res = requests.get(url, auth=creds)
    
    # get data other than posts data
    metadata = {
                key: value for key, value in res.json().items() 
                if key != 'posts'
            }
    
    last_page = metadata['page_count']
    current_page = metadata['page']
    
    #count = 0 #keep track of posts # don't think this is needed with this
    # approach

    # goal is to get posts from all pages, so we'll check if we've reached 
    # the last page, but will also put in a catch for the rate limit as 
    # mentioned by the API docs
    
    while current_page <= last_page:
        res = requests.get(url, auth=creds)

        if res.status_code == 200:
            posts = pd.DataFrame(res.json()['posts'])
            metadata = {
                key: value for key, value in res.json().items() 
                if key != 'posts'
            }
            
            # update current page
            current_page = metadata['page']
            
            # update URL to pull posts from next page
            url = metadata['next_page']

            all_posts.append(posts)
            
            #sleep(5)

            if metadata['page'] == metadata['page_count']: 
                break #break loop if last page is reached

            #get sequential posts from most recent to least    
            #params['before'] = posts['created_utc'].min()
        elif res.status_code == 429:
            sleep(res.headers['retry-after'])
        else:
            print(f'status: {res.status_code}')  
    print(f'posts retrieved from API: {len(all_posts)}')

    return pd.concat(all_posts)