# Data Collection and Cleaning

Data was collected from two sources: Air Quality Index (AQI) data from the EPA, and birth data relating to weight from the CDC.  

## Imports

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

The air quality data was downloaded from https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual as CSV files containing annual data for the years 2006-2021.  We combine these CSV files into a single dataframe and then evaluate whether it meets the needs of the project, or whether the data will instead need to be collected through the API that the EPA offers for AirData.

In [100]:
# code from Winston merged with approach from
# https://towardsdatascience.com/pandas-concat-tricks-you-should-know-to-speed-up-your-data-analysis-cd3d4fdfe6dd

data_path = Path('data/')
combined_path = data_path / 'aqi_by_year_2006-2021.csv'

# Collect the AQI csv files to combine.  The combined export written at the
# bottom of this cell also matches '*aqi*', so it is excluded here; otherwise
# re-running the cell would read the previous export back in and duplicate
# its rows.  Sorting makes the row order independent of the order in which
# the filesystem happens to list the files.
aqi_files = sorted(
    file
    for file in data_path.glob('*aqi*')
    if file.is_file() and file.name != combined_path.name
)
if not aqi_files:
    raise FileNotFoundError(f"no AQI csv files found in '{data_path}'")

# read each file into its own dataframe
dfs = [pd.read_csv(file) for file in aqi_files]

# concatenate the dataframes, renumbering the rows so the index is unique
res = pd.concat(dfs, ignore_index=True)

# export the final csv
res.to_csv(combined_path, index=False)

Read the final CSV for all the AQI data

In [101]:
# Load the combined AQI export back from disk.
aqi_csv_path = 'data/aqi_by_year_2006-2021.csv'
aqi = pd.read_csv(aqi_csv_path)

In [102]:
aqi.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2009,252,218,32,2,0,0,0,136,53,36,0,0,200,52,0
1,Alabama,Clay,2009,119,97,22,0,0,0,0,94,59,33,0,0,0,119,0
2,Alabama,Colbert,2009,323,220,103,0,0,0,0,76,60,43,0,0,132,191,0
3,Alabama,DeKalb,2009,363,311,52,0,0,0,0,100,54,36,0,0,308,55,0
4,Alabama,Elmore,2009,244,228,16,0,0,0,0,80,49,36,0,0,244,0,0


In [103]:
aqi.columns

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days PM2.5', 'Days PM10'],
      dtype='object')

In [104]:
aqi.shape

(33502, 18)

In [105]:
aqi.dtypes

State                                  object
County                                 object
Year                                    int64
Days with AQI                           int64
Good Days                               int64
Moderate Days                           int64
Unhealthy for Sensitive Groups Days     int64
Unhealthy Days                          int64
Very Unhealthy Days                     int64
Hazardous Days                          int64
Max AQI                                 int64
90th Percentile AQI                     int64
Median AQI                              int64
Days CO                                 int64
Days NO2                                int64
Days Ozone                              int64
Days PM2.5                              int64
Days PM10                               int64
dtype: object

In [106]:
# Normalise the column labels: lower-case them and swap spaces for underscores.
aqi.columns = aqi.columns.str.lower().str.replace(' ', '_', regex=False)

In [107]:
aqi.columns

Index(['state', 'county', 'year', 'days_with_aqi', 'good_days',
       'moderate_days', 'unhealthy_for_sensitive_groups_days',
       'unhealthy_days', 'very_unhealthy_days', 'hazardous_days', 'max_aqi',
       '90th_percentile_aqi', 'median_aqi', 'days_co', 'days_no2',
       'days_ozone', 'days_pm2.5', 'days_pm10'],
      dtype='object')

In [108]:
aqi.isnull().sum()

state                                  0
county                                 0
year                                   0
days_with_aqi                          0
good_days                              0
moderate_days                          0
unhealthy_for_sensitive_groups_days    0
unhealthy_days                         0
very_unhealthy_days                    0
hazardous_days                         0
max_aqi                                0
90th_percentile_aqi                    0
median_aqi                             0
days_co                                0
days_no2                               0
days_ozone                             0
days_pm2.5                             0
days_pm10                              0
dtype: int64

In [109]:
aqi[['state', 'county']].nunique()

state      55
county    913
dtype: int64

In [110]:
aqi['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Country Of Mexico', 'Delaware',
       'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virgin Islands', 'Virginia', 'Washington', 'West Virginia',
       'Wisconsin', 'Wyoming', 'Canada'], dtype=object)

May want to drop locations such as:
* Country Of Mexico
* Puerto Rico
* Virgin Islands
* Canada

In [111]:
# Summary statistics for the numeric AQI columns.
# NOTE(review): the recorded output shows a max_aqi of 14043 while the 75th
# percentile is 145 -- worth checking for outliers before modelling.
aqi.describe()

Unnamed: 0,year,days_with_aqi,good_days,moderate_days,unhealthy_for_sensitive_groups_days,unhealthy_days,very_unhealthy_days,hazardous_days,max_aqi,90th_percentile_aqi,median_aqi,days_co,days_no2,days_ozone,days_pm2.5,days_pm10
count,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0,33502.0
mean,2013.378843,304.81822,233.813563,64.906215,5.033132,0.920542,0.095994,0.048773,127.18375,61.495553,37.474599,1.425766,6.699361,169.352039,112.846815,14.494239
std,4.607776,91.250699,83.797024,51.877028,10.433271,4.05487,1.104159,0.679988,221.866188,19.566392,11.121959,15.505999,26.626984,119.843147,110.158979,53.624707
min,2006.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2009.0,248.0,176.0,25.0,0.0,0.0,0.0,0.0,89.0,50.0,33.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,360.0,249.0,53.0,1.0,0.0,0.0,0.0,112.0,59.0,39.0,0.0,0.0,185.0,93.0,0.0
75%,2017.0,365.0,304.0,93.0,6.0,0.0,0.0,0.0,145.0,71.0,44.0,0.0,0.0,245.0,183.0,1.0
max,2021.0,366.0,365.0,339.0,122.0,92.0,74.0,37.0,14043.0,306.0,132.0,365.0,365.0,366.0,366.0,366.0


Next, import the CDC data and compare its counties with the counties in the AQI data.

In [112]:
# The CDC export is tab-delimited; read_table uses a tab separator by default.
cdc = pd.read_table('data/Natality_by_year_2007-2021.txt')

In [113]:
cdc.head()

Unnamed: 0,Notes,State,State Code,County,County Code,Infant Birth Weight 12,Infant Birth Weight 12 Code,Year,Year Code,Births,% of Total Births,Average Birth Weight,Average LMP Gestational Age,Average OE Gestational Age
0,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2014.0,2014.0,10.0,0.00%,814.9,26.8,26.9
1,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2015.0,2015.0,15.0,0.00%,794.13,25.87,25.87
2,,Alabama,1.0,"Baldwin County, AL",1003.0,500 - 999 grams,2.0,2016.0,2016.0,10.0,0.00%,712.5,25.3,25.2
3,,Alabama,1.0,"Baldwin County, AL",1003.0,1000 - 1499 grams,3.0,2014.0,2014.0,10.0,0.00%,1323.7,28.9,29.2
4,,Alabama,1.0,"Baldwin County, AL",1003.0,1000 - 1499 grams,3.0,2015.0,2015.0,18.0,0.00%,1226.56,30.33,30.0


Drop the rows that contain the footnotes (these rows have no `State` value).

In [114]:
# Keep only the rows that have a State value (the footnote rows do not).
# The frame is rebound rather than mutated in place, and copied explicitly so
# that later column assignments operate on an independent dataframe.
cdc = cdc.dropna(subset=['State']).copy()

In [115]:
cdc.tail()

Unnamed: 0,Notes,State,State Code,County,County Code,Infant Birth Weight 12,Infant Birth Weight 12 Code,Year,Year Code,Births,% of Total Births,Average Birth Weight,Average LMP Gestational Age,Average OE Gestational Age
21209,,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2017.0,2017.0,230.0,0.01%,2312.23,36.6,36.52
21210,,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2018.0,2018.0,229.0,0.01%,2300.62,36.81,36.59
21211,,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2019.0,2019.0,234.0,0.01%,2324.8,36.53,36.47
21212,,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2020.0,2020.0,232.0,0.01%,2302.12,36.78,36.43
21213,,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2000 - 2499 grams,5.0,2021.0,2021.0,230.0,0.01%,2303.27,36.19,36.3


In [116]:
cdc[['State', 'County']].nunique()

State      50
County    611
dtype: int64

In [117]:
cdc['County'].tail()

21209    Unidentified Counties, WY
21210    Unidentified Counties, WY
21211    Unidentified Counties, WY
21212    Unidentified Counties, WY
21213    Unidentified Counties, WY
Name: County, dtype: object

The CDC data includes the state abbreviation in the County name (e.g. "Baldwin County, AL"), so that will be removed.  Some entries say "Counties" rather than "County" (e.g. "Unidentified Counties, WY"), and some locations are parishes (in LA) or use other unique designations.  So I'm splitting on " Count" to capture both County and Counties, and then on "," to strip the state abbreviation from the names that contain neither.  Note that this leaves designations such as "Parish", "Borough" and "city" in the name.

In [122]:
def _county_base_name(label):
    """Return the part of ``label`` before the first ' Count' and the first ','."""
    return label.partition(' Count')[0].partition(',')[0]


# Strip the "County"/"Counties" suffix and the trailing state abbreviation.
cdc['County'] = cdc['County'].map(_county_base_name)

In [127]:
# How many CDC county names never appear among the AQI county names?
# The comparison is on the name alone; the state is not taken into account.
len(set(cdc['County']).difference(aqi['county']))

58

In [128]:
# List the CDC county names that have no same-named county in the AQI data.
set(cdc['County']).difference(aqi['county'])

{'Alexandria city',
 'Anchorage Borough',
 'Ascension Parish',
 'Baltimore city',
 'Bartow',
 'Bossier Parish',
 'Broome',
 'Brunswick',
 'Cabarrus',
 'Caddo Parish',
 'Calcasieu Parish',
 'Calhoun',
 'Charlotte',
 'Chesapeake city',
 'Comal',
 'Craven',
 'East Baton Rouge Parish',
 'Eaton',
 'Fort Bend',
 'Grayson',
 'Guadalupe',
 'Hampton city',
 'Harnett',
 'Hernando',
 'Iredell',
 'Jefferson Parish',
 'Kankakee',
 'Kendall',
 'La Porte',
 'LaSalle',
 'Lafayette Parish',
 'Livingston Parish',
 'Midland',
 'Newport News city',
 'Norfolk city',
 'Ontario',
 'Orleans Parish',
 'Ouachita Parish',
 'Portsmouth city',
 'Rankin',
 'Rapides Parish',
 'Richmond city',
 'Saginaw',
 'Saline',
 'Spotsylvania',
 'St. Johns',
 'St. Louis',
 'St. Louis city',
 "St. Mary's",
 'St. Tammany Parish',
 'Strafford',
 'Tangipahoa Parish',
 'Tazewell',
 'Terrebonne Parish',
 'Tom Green',
 'Unidentified',
 'Virginia Beach city',
 'Whitfield'}