# Data Collection and Cleaning

Data has been collected from both the EPA for Air Quality Index data and from the CDC for birth data relating to weight.  

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

## Data

### EPA AQI Data

The air quality data was downloaded from https://aqs.epa.gov/aqsweb/airdata/download_files.html#Annual as files containing annual data from years 2016-2021.  We are combining these CSV files into a single dataframe and will evaluate whether it will meet the needs for the project, or whether data will need to be collected through the API that the EPA offers for AirData.

In [290]:
# code from Winston merged with approach from
# https://towardsdatascience.com/pandas-concat-tricks-you-should-know-to-speed-up-your-data-analysis-cd3d4fdfe6dd

data_path = Path('data/')

# create a list of all the AQI csv files
dfs = (
    pd.read_csv(file) for file in data_path.glob('*aqi*') if file.is_file()
)

# concatenate the dataframes
res = pd.concat(dfs)

# export the final csv
res.to_csv('data/aqi_by_year_2006-2021.csv', index=False)

Read the final CSV for all the AQI data

In [2]:
aqi = pd.read_csv('data/aqi_by_year_2006-2021.csv')

In [3]:
aqi.head()

Unnamed: 0,state,county,year,days_with_aqi,good_days,moderate_days,unhealthy_for_sensitive_groups_days,unhealthy_days,very_unhealthy_days,hazardous_days,max_aqi,90th_percentile_aqi,median_aqi,days_co,days_no2,days_ozone,days_pm2.5,days_pm10
0,Alabama,Baldwin,2006,280,174,81,24,1,0,0,156,97,46,0,0,219,61,0
1,Alabama,Barbour,2006,11,6,5,0,0,0,0,77,63,35,0,0,0,11,0
2,Alabama,Clay,2006,286,165,112,9,0,0,0,140,80,48,0,0,201,85,0
3,Alabama,Colbert,2006,282,180,95,7,0,0,0,129,74,45,0,0,191,91,0
4,Alabama,DeKalb,2006,363,215,133,15,0,0,0,140,84,47,0,0,303,60,0


In [4]:
aqi.columns

Index(['state', 'county', 'year', 'days_with_aqi', 'good_days',
       'moderate_days', 'unhealthy_for_sensitive_groups_days',
       'unhealthy_days', 'very_unhealthy_days', 'hazardous_days', 'max_aqi',
       '90th_percentile_aqi', 'median_aqi', 'days_co', 'days_no2',
       'days_ozone', 'days_pm2.5', 'days_pm10'],
      dtype='object')

In [5]:
aqi.shape

(16751, 18)

In [6]:
aqi.dtypes

state                                  object
county                                 object
year                                    int64
days_with_aqi                           int64
good_days                               int64
moderate_days                           int64
unhealthy_for_sensitive_groups_days     int64
unhealthy_days                          int64
very_unhealthy_days                     int64
hazardous_days                          int64
max_aqi                                 int64
90th_percentile_aqi                     int64
median_aqi                              int64
days_co                                 int64
days_no2                                int64
days_ozone                              int64
days_pm2.5                              int64
days_pm10                               int64
dtype: object

In [296]:
aqi.columns = [col.lower().replace(' ', '_') for col in aqi.columns]

In [297]:
aqi.columns

Index(['state', 'county', 'year', 'days_with_aqi', 'good_days',
       'moderate_days', 'unhealthy_for_sensitive_groups_days',
       'unhealthy_days', 'very_unhealthy_days', 'hazardous_days', 'max_aqi',
       '90th_percentile_aqi', 'median_aqi', 'days_co', 'days_no2',
       'days_ozone', 'days_pm2.5', 'days_pm10'],
      dtype='object')

In [7]:
aqi.isnull().sum()

state                                  0
county                                 0
year                                   0
days_with_aqi                          0
good_days                              0
moderate_days                          0
unhealthy_for_sensitive_groups_days    0
unhealthy_days                         0
very_unhealthy_days                    0
hazardous_days                         0
max_aqi                                0
90th_percentile_aqi                    0
median_aqi                             0
days_co                                0
days_no2                               0
days_ozone                             0
days_pm2.5                             0
days_pm10                              0
dtype: int64

In [8]:
aqi[['state', 'county']].nunique()

state      55
county    913
dtype: int64

In [300]:
aqi['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Country Of Mexico', 'Delaware',
       'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virgin Islands', 'Virginia', 'Washington', 'West Virginia',
       'Wisconsin', 'Wyoming', 'Canada'], dtype=object)

May want to drop locations such as:
* Country of Mexico
* Puerto Rico
* Virgin Islands
* Canada

In [484]:
# aqi['county'].unique()

In [302]:
aqi.describe()

Unnamed: 0,year,days_with_aqi,good_days,moderate_days,unhealthy_for_sensitive_groups_days,unhealthy_days,very_unhealthy_days,hazardous_days,max_aqi,90th_percentile_aqi,median_aqi,days_co,days_no2,days_ozone,days_pm2.5,days_pm10
count,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0,67004.0
mean,2013.378843,304.81822,233.813563,64.906215,5.033132,0.920542,0.095994,0.048773,127.18375,61.495553,37.474599,1.425766,6.699361,169.352039,112.846815,14.494239
std,4.607741,91.250018,83.796399,51.876641,10.433193,4.05484,1.10415,0.679983,221.864532,19.566246,11.121876,15.505884,26.626785,119.842252,110.158157,53.624306
min,2006.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2009.0,248.0,176.0,25.0,0.0,0.0,0.0,0.0,89.0,50.0,33.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,360.0,249.0,53.0,1.0,0.0,0.0,0.0,112.0,59.0,39.0,0.0,0.0,185.0,93.0,0.0
75%,2017.0,365.0,304.0,93.0,6.0,0.0,0.0,0.0,145.0,71.0,44.0,0.0,0.0,245.0,183.0,1.0
max,2021.0,366.0,365.0,339.0,122.0,92.0,74.0,37.0,14043.0,306.0,132.0,365.0,365.0,366.0,366.0,366.0


### CDC Data

Want to import the CDC data and compare the counties.

We are using the WONDER tool from the CDC to export the files that we are importing to this notebook. For both files, we filtered out any births with maternal risk factors so we they are not influencing birth weight.  There are some years that have no or suppressed data, which are omitted from the CDC export and are not present.We filtered out any births with maternal risk factors so we they are not influencing birth weight.  There are some years that have no or suppressed data, which are omitted from the CDC export and are not present. 

There are two main files:
1. `Natality_low_2007-2021` which represents all the births below 2500 grams by year and county. 
1. `Natality_all_2007-2021` which represents all the births by year and county.

#### Import Data

In [9]:
cdc_low = pd.read_csv('data/Natality_low_2007-2021.txt', sep='\t')
cdc_all = pd.read_csv('data/Natality_all_2007-2021.txt', sep='\t')

#### General Look and Cleanup

In [10]:
cdc_low.head()

Unnamed: 0,Notes,State,State Code,County,County Code,Year,Year Code,Births,Average Birth Weight,Average LMP Gestational Age,Average OE Gestational Age
0,,Alabama,1.0,"Baldwin County, AL",1003.0,2014.0,2014.0,137.0,1995.255,34.241,34.051
1,,Alabama,1.0,"Baldwin County, AL",1003.0,2015.0,2015.0,141.0,1832.837,32.965,32.922
2,,Alabama,1.0,"Baldwin County, AL",1003.0,2016.0,2016.0,120.0,2008.358,34.467,34.058
3,,Alabama,1.0,"Baldwin County, AL",1003.0,2017.0,2017.0,100.0,2082.83,34.55,34.38
4,,Alabama,1.0,"Baldwin County, AL",1003.0,2018.0,2018.0,122.0,1942.303,33.762,33.508


In [11]:
cdc_all.head()

Unnamed: 0,Notes,State,State Code,County,County Code,Year,Year Code,Births,Average Birth Weight,Average LMP Gestational Age,Average OE Gestational Age
0,,Alabama,1.0,"Baldwin County, AL",1003.0,2014.0,2014.0,1724.0,3296.472,38.595,38.489
1,,Alabama,1.0,"Baldwin County, AL",1003.0,2015.0,2015.0,1857.0,3290.811,38.487,38.347
2,,Alabama,1.0,"Baldwin County, AL",1003.0,2016.0,2016.0,1722.0,3315.387,38.69,38.497
3,,Alabama,1.0,"Baldwin County, AL",1003.0,2017.0,2017.0,1782.0,3346.588,38.713,38.572
4,,Alabama,1.0,"Baldwin County, AL",1003.0,2018.0,2018.0,1770.0,3301.324,38.58,38.423


In [12]:
cdc_low.dtypes

Notes                           object
State                           object
State Code                     float64
County                          object
County Code                    float64
Year                           float64
Year Code                      float64
Births                         float64
Average Birth Weight           float64
Average LMP Gestational Age    float64
Average OE Gestational Age     float64
dtype: object

In [13]:
cdc_all.dtypes

Notes                           object
State                           object
State Code                     float64
County                          object
County Code                    float64
Year                           float64
Year Code                      float64
Births                         float64
Average Birth Weight           float64
Average LMP Gestational Age    float64
Average OE Gestational Age     float64
dtype: object

In [14]:
cdc_low.shape

(6712, 11)

In [15]:
cdc_all.shape

(6715, 11)

In [16]:
cdc_low.columns, cdc_all.columns

(Index(['Notes', 'State', 'State Code', 'County', 'County Code', 'Year',
        'Year Code', 'Births', 'Average Birth Weight',
        'Average LMP Gestational Age', 'Average OE Gestational Age'],
       dtype='object'),
 Index(['Notes', 'State', 'State Code', 'County', 'County Code', 'Year',
        'Year Code', 'Births', 'Average Birth Weight',
        'Average LMP Gestational Age', 'Average OE Gestational Age'],
       dtype='object'))

In [17]:
cdc_low.columns = [col.lower().replace(' ', '_') for col in cdc_low.columns]
cdc_all.columns = [col.lower().replace(' ', '_') for col in cdc_all.columns]

Renaming the % of births column, and if we want different names for other columns, we can do it at this step

In [18]:
cdc_low.columns

Index(['notes', 'state', 'state_code', 'county', 'county_code', 'year',
       'year_code', 'births', 'average_birth_weight',
       'average_lmp_gestational_age', 'average_oe_gestational_age'],
      dtype='object')

In [19]:
cdc_low['notes'].nunique(), cdc_all['notes'].nunique()

(91, 87)

In [20]:
# cdc_all['notes'].unique()

All the notes look like they are what shows up at the bottom of the file and isn't data that we're looking for.  Dropping the notes column

In [21]:
cdc_low.drop(['notes'], axis=1, inplace=True)
cdc_all.drop(['notes'], axis=1, inplace=True)

In [22]:
cdc_low.isna().sum()[cdc_low.isna().sum() > 0]

state                          96
state_code                     96
county                         96
county_code                    96
year                           96
year_code                      96
births                         96
average_birth_weight           96
average_lmp_gestational_age    96
average_oe_gestational_age     96
dtype: int64

In [23]:
cdc_all.isna().sum()[cdc_all.isna().sum() > 0]

state                          92
state_code                     92
county                         92
county_code                    92
year                           92
year_code                      92
births                         92
average_birth_weight           92
average_lmp_gestational_age    92
average_oe_gestational_age     92
dtype: int64

In [24]:
cdc_low.tail(3)

Unnamed: 0,state,state_code,county,county_code,year,year_code,births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
6709,,,,,,,,,,
6710,,,,,,,,,,
6711,,,,,,,,,,


In [25]:
cdc_all.tail(3)

Unnamed: 0,state,state_code,county,county_code,year,year_code,births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
6712,,,,,,,,,,
6713,,,,,,,,,,
6714,,,,,,,,,,


We need state and county information for this project,and it looks like all these null values belong, possibly to rows that were holding that notes data.  Dropping these rows to see if that clears up all the nulls.

Drop the rows that have the footnotes in them

In [26]:
cdc_low.dropna(subset=['state'], inplace=True)
cdc_all.dropna(subset=['state'], inplace=True)

In [27]:
cdc_low.isna().sum()[cdc_low.isna().sum() > 0]

Series([], dtype: int64)

In [28]:
cdc_all.isna().sum()[cdc_all.isna().sum() > 0]

Series([], dtype: int64)

That did resolve all the null values.

In [29]:
cdc_low.shape, cdc_all.shape

((6616, 10), (6623, 10))

In [30]:
cdc_low.tail(3)

Unnamed: 0,state,state_code,county,county_code,year,year_code,births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
6613,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2019.0,2019.0,335.0,2062.824,35.051,34.919
6614,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2020.0,2020.0,307.0,2088.586,35.365,34.987
6615,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2021.0,2021.0,319.0,2063.197,34.821,34.862


In [31]:
cdc_all.tail(3)

Unnamed: 0,state,state_code,county,county_code,year,year_code,births,average_birth_weight,average_lmp_gestational_age,average_oe_gestational_age
6620,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2019.0,2019.0,4592.0,3218.04,38.709,38.636
6621,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2020.0,2020.0,4285.0,3228.561,38.733,38.641
6622,Wyoming,56.0,"Unidentified Counties, WY",56999.0,2021.0,2021.0,4379.0,3227.242,38.608,38.551


It looks like year and year_code might contain the same data

In [32]:
(cdc_low['year'] == cdc_low['year_code']).sum() == cdc_low.shape[0]

True

`year` and `year_code` appear to be the same.  We also don't have state or county codes in the EPA data, so dropping those as well.

In [33]:
cdc_low.drop(['year_code', 'county_code', 'state_code'], axis=1, inplace=True)
cdc_all.drop(['year_code', 'county_code', 'state_code'], axis=1, inplace=True)

Also want the year as an int not a float

In [34]:
cdc_low['year'] = cdc_low['year'].astype(int)
cdc_all['year'] = cdc_all['year'].astype(int)

In [35]:
cdc_low.columns

Index(['state', 'county', 'year', 'births', 'average_birth_weight',
       'average_lmp_gestational_age', 'average_oe_gestational_age'],
      dtype='object')

#### Gestational Age at Birth
Beginning in 2014 NCHS changed the standard for gestational period from the Last Menstrual Period (LMP) based gestational age to the Obstetric/clinical Estimate (OE) based gestational age. Obstetric/clinical Estimate (OE) based gestational age groups are available for years 2007 and later in WONDER since February 2016. Refer to [Measuring Gestational Age in Vital Statistics Data: Transitioning to the Obstetric Estimate](http://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_05.pdf) for more information.

I believe this means this means we should drop the LMP column and keep the OE column.

In [38]:
cdc_low.drop(['average_lmp_gestational_age'], axis=1, inplace=True)
cdc_all.drop(['average_lmp_gestational_age'], axis=1, inplace=True)

KeyError: "['average_lmp_gestational_age'] not found in axis"

In [39]:
cdc_low.columns

Index(['state', 'county', 'year', 'births', 'average_birth_weight',
       'average_oe_gestational_age'],
      dtype='object')

#### Rename Columns

To prepare for merging this data, giving the columns specific names to indicate which dataframe they came from.

In [41]:
cdc_low.rename(columns={
    'births': 'births_low',
    'average_birth_weight': 'avg_weight_low',
    'average_oe_gestational_age': 'avg_ges_age_low'
    }, inplace=True)

cdc_all.rename(columns={
    'births': 'births_all',
    'average_birth_weight': 'avg_weight_all',
    'average_oe_gestational_age': 'avg_ges_age_all'
    }, inplace=True)

In [42]:
cdc_low.head(3)

Unnamed: 0,state,county,year,births_low,avg_weight_low,avg_ges_age_low
0,Alabama,"Baldwin County, AL",2014,137.0,1995.255,34.051
1,Alabama,"Baldwin County, AL",2015,141.0,1832.837,32.922
2,Alabama,"Baldwin County, AL",2016,120.0,2008.358,34.058


In [43]:
cdc_all.head(3)

Unnamed: 0,state,county,year,births_all,avg_weight_all,avg_ges_age_all
0,Alabama,"Baldwin County, AL",2014,1724.0,3296.472,38.489
1,Alabama,"Baldwin County, AL",2015,1857.0,3290.811,38.347
2,Alabama,"Baldwin County, AL",2016,1722.0,3315.387,38.497


#### Merge Data

Before further cleaning, the dataframes will be merged.  The low birthweight dataframe `cdc_low` is the main data and we will supplement that with the all birthweight data `cdc_all` so we should end up with a single dataframe that contains 1 entry for each year/county combination that has both the low birthweight and all birthweight data.

In [44]:
cdc = pd.merge(cdc_low, cdc_all, how='left', on=['state', 'county', 'year'])

In [45]:
cdc.head()

Unnamed: 0,state,county,year,births_low,avg_weight_low,avg_ges_age_low,births_all,avg_weight_all,avg_ges_age_all
0,Alabama,"Baldwin County, AL",2014,137.0,1995.255,34.051,1724.0,3296.472,38.489
1,Alabama,"Baldwin County, AL",2015,141.0,1832.837,32.922,1857.0,3290.811,38.347
2,Alabama,"Baldwin County, AL",2016,120.0,2008.358,34.058,1722.0,3315.387,38.497
3,Alabama,"Baldwin County, AL",2017,100.0,2082.83,34.38,1782.0,3346.588,38.572
4,Alabama,"Baldwin County, AL",2018,122.0,1942.303,33.508,1770.0,3301.324,38.423


#### County Names

In [46]:
cdc['county'].sample(10)

2751           Cecil County, MD
2600         Orleans Parish, LA
3077      Livingston County, MI
1590          Fulton County, GA
2838      St. Mary's County, MD
2531           Caddo Parish, LA
6554        Walworth County, WI
4122    St. Lawrence County, NY
2422       Wyandotte County, KS
2578       Lafayette Parish, LA
Name: county, dtype: object

I ran the sample code above multiple times to see some of the data in the county column.  Looks like they are including the state abbreviation in the County name, so will remove that.  There were also some places that didn't say "County" and said "Counties" (i.e. Unidentified Counties, WY) and also there were some locations that were parishes (in LA) or other unique monikers like Borough.  So I'm splitting on both the " Count" to capture County and Counties as well as "," to get both cases.

We should look at what the AQI data has and decide whether we need to treat the remaining unique naming cases that remain after this clean up.

In [584]:
cdc['county'] = cdc['county'
                   ].apply(lambda x: x.split(' Count')[0].split(',')[0].title())

In [585]:
cdc[['state', 'county']].nunique()

state      50
county    466
dtype: int64

In [586]:
cdc['county'].sample(10)

4860           Richland
3125            Saginaw
3751         Bernalillo
6483           Marathon
271             Alameda
2747            Carroll
5369               York
2825    Prince George'S
6254       Norfolk City
2662       Unidentified
Name: county, dtype: object

This looks more like what I would expect where it is just the name of the county.  Since we also saw that there were unidentified counties, and they are all just listed as Unidentified now, we should drop that data since we are trying to investigate county air quality and county birth weights, so without a specific county, the data is not valuable to us.

In [587]:
# looking through all the unique values for anything else interesting
# cdc['county'].unique()

I'm also seeing many counties that end in lower case 'city' so investigating some of those, like:
* St. Louis city
* Chesapeake city
* Norfolk city
* Portsmouth city

St. Louis is interesting, since it sounds like the city is a county, but there is also a St. Louis county that surrounds the city according to https://www.stlouis-mo.gov/government/about/city-government-structure.cfm

Chesapeake, Virginia is also an independent city and not part of a county https://en.wikipedia.org/wiki/Chesapeake,_Virginia

Given this information and the information above about seeing some "parish" listings in Louisiana, we checked against the AQI county listings and found that in the AQI data:
* the word Parish does not appear, but the name of the Parish does--like 'East Baton Rouge', so we should drop 'Parish' from the CDC county names.
* The word City does appear in the AQI data, but it is capitalized, so we should title case the CDC data or make all of them lower case when we go to match them up (this has been incorporated into earlier cleaning steps and is no longer a concern)

In [588]:
# drop parish
cdc['county'] = cdc['county'
                   ].apply(lambda x: x.split(' Parish')[0])

In [589]:
cdc.shape

(6616, 9)

In [590]:
cdc.drop(cdc[cdc['county'] == 'Unidentified'].index, inplace=True)

In [591]:
cdc.shape

(6094, 9)

In [592]:
cdc.dtypes

state               object
county              object
year                 int64
births_low         float64
avg_weight_low     float64
avg_ges_age_low    float64
births_all         float64
avg_weight_all     float64
avg_ges_age_all    float64
dtype: object

#### Calculate Percent of Low Birthweight

We have the total births and the low birthweight births, so let's create a percentage of low birthrate births before exporting the cleaned data.

In [614]:
cdc['pct_low'] = (cdc['births_low'] / cdc['births_all']).round(3)

In [615]:
cdc.head(10)

Unnamed: 0,state,county,year,births_low,avg_weight_low,avg_ges_age_low,births_all,avg_weight_all,avg_ges_age_all,pct_low
0,Alabama,Baldwin,2014,137.0,1995.255,34.051,1724.0,3296.472,38.489,0.079
1,Alabama,Baldwin,2015,141.0,1832.837,32.922,1857.0,3290.811,38.347,0.076
2,Alabama,Baldwin,2016,120.0,2008.358,34.058,1722.0,3315.387,38.497,0.07
3,Alabama,Baldwin,2017,100.0,2082.83,34.38,1782.0,3346.588,38.572,0.056
4,Alabama,Baldwin,2018,122.0,1942.303,33.508,1770.0,3301.324,38.423,0.069
5,Alabama,Baldwin,2019,99.0,2060.768,34.919,1845.0,3327.293,38.521,0.054
6,Alabama,Baldwin,2020,113.0,2025.478,34.982,1763.0,3305.012,38.529,0.064
7,Alabama,Baldwin,2021,120.0,1969.508,34.269,1969.0,3293.984,38.394,0.061
8,Alabama,Calhoun,2014,59.0,1972.661,34.475,948.0,3275.708,38.668,0.062
9,Alabama,Calhoun,2015,65.0,1909.708,33.892,983.0,3267.121,38.59,0.066


### Export Data

In [616]:
cdc.to_csv('data/cdc_cleaned.csv', index=False)

# EDA

## Read Cleaned Data

In [617]:
births = pd.read_csv('data/cdc_cleaned.csv')

In [618]:
births.head()

Unnamed: 0,state,county,year,births_low,avg_weight_low,avg_ges_age_low,births_all,avg_weight_all,avg_ges_age_all,pct_low
0,Alabama,Baldwin,2014,137.0,1995.255,34.051,1724.0,3296.472,38.489,0.079
1,Alabama,Baldwin,2015,141.0,1832.837,32.922,1857.0,3290.811,38.347,0.076
2,Alabama,Baldwin,2016,120.0,2008.358,34.058,1722.0,3315.387,38.497,0.07
3,Alabama,Baldwin,2017,100.0,2082.83,34.38,1782.0,3346.588,38.572,0.056
4,Alabama,Baldwin,2018,122.0,1942.303,33.508,1770.0,3301.324,38.423,0.069


In [597]:
births.shape

(6094, 9)

In [598]:
births.describe()

Unnamed: 0,year,births_low,avg_weight_low,avg_ges_age_low,births_all,avg_weight_all,avg_ges_age_all
count,6094.0,6094.0,6094.0,6094.0,6094.0,6094.0,6094.0
mean,2015.556285,314.028224,1989.488184,34.113257,4714.525271,3317.070518,38.647829
std,3.856319,572.645302,62.995391,0.507744,8418.105318,65.225411,0.20445
min,2007.0,13.0,1682.821,31.75,399.0,3065.051,37.726
25%,2013.0,80.0,1950.08325,33.814,1323.0,3274.3585,38.522
50%,2016.0,137.0,1989.376,34.117,2195.0,3320.2095,38.647
75%,2019.0,325.75,2028.8195,34.41875,4873.5,3363.171,38.783
max,2021.0,9406.0,2269.811,36.378,137740.0,3498.659,39.289


In [619]:
births['year'].value_counts()

2020    563
2021    563
2016    551
2017    551
2018    551
2019    551
2015    493
2014    487
2013    380
2012    341
2011    308
2010    228
2009    183
2008    179
2007    165
Name: year, dtype: int64

In [602]:
set(births[births['year'] == 2007]['county'])

{'Adams',
 'Alameda',
 'Albany',
 'Allen',
 'Arapahoe',
 'Ashtabula',
 'Bell',
 'Bexar',
 'Black Hawk',
 'Boulder',
 'Brazoria',
 'Brazos',
 'Broome',
 'Butler',
 'Butte',
 'Cameron',
 'Cass',
 'Chautauqua',
 'Chittenden',
 'Clark',
 'Clermont',
 'Collin',
 'Columbiana',
 'Contra Costa',
 'Cuyahoga',
 'Dallas',
 'Delaware',
 'Denton',
 'Denver',
 'Douglas',
 'Dutchess',
 'Ector',
 'El Dorado',
 'El Paso',
 'Elkhart',
 'Ellis',
 'Erie',
 'Fairfield',
 'Fort Bend',
 'Franklin',
 'Fresno',
 'Galveston',
 'Grayson',
 'Greene',
 'Gregg',
 'Hamilton',
 'Harris',
 'Hendricks',
 'Hidalgo',
 'Hillsborough',
 'Humboldt',
 'Imperial',
 'Jefferson',
 'Johnson',
 'Kent',
 'Kern',
 'Kings',
 'La Porte',
 'Lake',
 'Larimer',
 'Licking',
 'Linn',
 'Lorain',
 'Los Angeles',
 'Lubbock',
 'Lucas',
 'Madera',
 'Madison',
 'Mahoning',
 'Marin',
 'Marion',
 'Mclennan',
 'Medina',
 'Merced',
 'Merrimack',
 'Mesa',
 'Midland',
 'Minnehaha',
 'Monroe',
 'Monterey',
 'Montgomery',
 'Napa',
 'Nassau',
 'New Cast

# PARKING LOT

In [603]:
# how many counties from the aqi are also in the cdc data
# len(set(aqi['county']).intersection(set(cdc['County'])))

In [604]:
# what are the differences
#set(aqi['county']).difference(set(cdc['County']))

### EPA API Information

Look to see if getting more granular data from the API is feasible

Your user ID is your email address: mrhurless@gmail.com 
Your key is: rubymouse94 

In [605]:
# import requests

In [606]:
# email = 'mrhurless@gmail.com'
# key = 'rubymouse94'

# url = f'https://aqs.epa.gov/data/api/list/parametersByClass?email={email}&key={key}&pc=CRITERIA'

# res = requests.get(url)

# res.status_code

In [607]:
# # from https://stackoverflow.com/questions/71603314/ssl-error-unsafe-legacy-renegotiation-disabled

# import urllib3
# import ssl

# class CustomHttpAdapter (requests.adapters.HTTPAdapter):
#     # "Transport adapter" that allows us to use custom ssl_context.

#     def __init__(self, ssl_context=None, **kwargs):
#         self.ssl_context = ssl_context
#         super().__init__(**kwargs)

#     def init_poolmanager(self, connections, maxsize, block=False):
#         self.poolmanager = urllib3.poolmanager.PoolManager(
#             num_pools=connections, maxsize=maxsize,
#             block=block, ssl_context=self.ssl_context)


# def get_legacy_session():
#     ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
#     ctx.options |= 0x4  # OP_LEGACY_SERVER_CONNECT
#     session = requests.session()
#     session.mount('https://', CustomHttpAdapter(ctx))
#     return session

In [608]:
# res = get_legacy_session().get(url)

In [609]:
def get_aqi():
    creds = ('mhurless@me.com', '0n3L0v3#')
    email = 'mrhurless@gmail.com'
    key = 'rubymouse94'
    all_posts = []
    
    url = "https://support.brightsign.biz/api/v2/community/posts"
    
    res = requests.get(url, auth=creds)
    
    # get data other than posts data
    metadata = {
                key: value for key, value in res.json().items() 
                if key != 'posts'
            }
    
    last_page = metadata['page_count']
    current_page = metadata['page']
    
    #count = 0 #keep track of posts # don't think this is needed with this
    # approach

    # goal is to get posts from all pages, so we'll check if we've reached 
    # the last page, but will also put in a catch for the rate limit as 
    # mentioned by the API docs
    
    while current_page <= last_page:
        res = requests.get(url, auth=creds)

        if res.status_code == 200:
            posts = pd.DataFrame(res.json()['posts'])
            metadata = {
                key: value for key, value in res.json().items() 
                if key != 'posts'
            }
            
            # update current page
            current_page = metadata['page']
            
            # update URL to pull posts from next page
            url = metadata['next_page']

            all_posts.append(posts)
            
            #sleep(5)

            if metadata['page'] == metadata['page_count']: 
                break #break loop if last page is reached

            #get sequential posts from most recent to least    
            #params['before'] = posts['created_utc'].min()
        elif res.status_code == 429:
            sleep(res.headers['retry-after'])
        else:
            print(f'status: {res.status_code}')  
    print(f'posts retrieved from API: {len(all_posts)}')

    return pd.concat(all_posts)