In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine

In [2]:
# Define paths
# Data from https://aqs.epa.gov/aqsweb/airdata/download_files.html
path15 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2015.csv'
path16 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2016.csv'
path17 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2017.csv'
path18 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2018.csv'
path19 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2019.csv'
path20 = '../01_Resources/EPA_AQI_data/daily_aqi_by_county_2020.csv'

In [51]:
# Read in data for six year of AQI data
df15 = pd.DataFrame(pd.read_csv(path15))
df16 = pd.DataFrame(pd.read_csv(path16))
df17 = pd.DataFrame(pd.read_csv(path17))
df18 = pd.DataFrame(pd.read_csv(path18))
df19 = pd.DataFrame(pd.read_csv(path19))
df20 = pd.DataFrame(pd.read_csv(path20))

df19

Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,Alabama,Baldwin,1,3,2019-01-03,18,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2019-01-06,35,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2019-01-09,14,Good,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2019-01-12,36,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2019-01-15,38,Good,PM2.5,01-003-0010,1
...,...,...,...,...,...,...,...,...,...,...
340164,Wyoming,Weston,56,45,2019-12-27,36,Good,Ozone,56-045-0003,2
340165,Wyoming,Weston,56,45,2019-12-28,37,Good,Ozone,56-045-0003,2
340166,Wyoming,Weston,56,45,2019-12-29,34,Good,Ozone,56-045-0003,2
340167,Wyoming,Weston,56,45,2019-12-30,36,Good,Ozone,56-045-0003,2


In [53]:
# Append all six years into one DF
five_years_df = df15.append([df16,df17,df18,df19], ignore_index=True)

In [54]:
# Split "Date" column into "Year", "Month", and "Day" columns
five_years_df['Year'] = [d.split('-')[0] for d in five_years_df.Date]
five_years_df['Month'] = [d.split('-')[1] for d in five_years_df.Date]
five_years_df['Day'] = [d.split('-')[2] for d in five_years_df.Date]

# Create column to hold Month and Day w/o year
# This is to compare specific days across all six years
# TODO: Find a cleaner, more effective way to do this
five_years_df['Month_Day'] = five_years_df['Month'] + '-' + five_years_df['Day']

# Change type from str to int64
five_years_df = five_years_df.astype({'Year': 'int64','Month': 'int64','Day': 'int64'})

# Drop unneeded columns
five_years_df = five_years_df.drop(columns={'State Code','Defining Site'})

# Limit DF to days before June
before_june_df = five_years_df.loc[five_years_df['Month']<6]

# Check result
before_june_df

Unnamed: 0,State Name,county Name,County Code,Date,AQI,Category,Defining Parameter,Number of Sites Reporting,Year,Month,Day,Month_Day
0,Alabama,Baldwin,3,2015-01-03,28,Good,PM2.5,1,2015,1,3,01-03
1,Alabama,Baldwin,3,2015-01-06,48,Good,PM2.5,1,2015,1,6,01-06
2,Alabama,Baldwin,3,2015-01-09,55,Moderate,PM2.5,1,2015,1,9,01-09
3,Alabama,Baldwin,3,2015-01-12,41,Good,PM2.5,1,2015,1,12,01-12
4,Alabama,Baldwin,3,2015-01-15,26,Good,PM2.5,1,2015,1,15,01-15
...,...,...,...,...,...,...,...,...,...,...,...,...
1683977,Wyoming,Weston,45,2019-05-27,46,Good,Ozone,2,2019,5,27,05-27
1683978,Wyoming,Weston,45,2019-05-28,43,Good,Ozone,2,2019,5,28,05-28
1683979,Wyoming,Weston,45,2019-05-29,71,Moderate,Ozone,2,2019,5,29,05-29
1683980,Wyoming,Weston,45,2019-05-30,61,Moderate,Ozone,2,2019,5,30,05-30


In [55]:
# Split "Date" column into "Year", "Month", and "Day" columns
df20['Year'] = [d.split('-')[0] for d in df20.Date]
df20['Month'] = [d.split('-')[1] for d in df20.Date]
df20['Day'] = [d.split('-')[2] for d in df20.Date]

# Create column to hold Month and Day w/o year
# This is to compare specific days across all six years
# TODO: Find a cleaner, more effective way to do this
df20['Month_Day'] = df20['Month'] + '-' + df20['Day']

# Change type from str to int64
df20 = df20.astype({'Year': 'int64','Month': 'int64','Day': 'int64'})

# Drop unneeded columns
df20 = df20.drop(columns={'State Code','Defining Site'})

# Limit DF to days before June
df20 = df20.loc[df20['Month']<6]

# Check result
df20

Unnamed: 0,State Name,county Name,County Code,Date,AQI,Category,Defining Parameter,Number of Sites Reporting,Year,Month,Day,Month_Day
0,Alabama,Baldwin,3,2020-01-01,48,Good,PM2.5,1,2020,1,1,01-01
1,Alabama,Baldwin,3,2020-01-04,13,Good,PM2.5,1,2020,1,4,01-04
2,Alabama,Baldwin,3,2020-01-07,14,Good,PM2.5,1,2020,1,7,01-07
3,Alabama,Baldwin,3,2020-01-10,39,Good,PM2.5,1,2020,1,10,01-10
4,Alabama,Baldwin,3,2020-01-13,29,Good,PM2.5,1,2020,1,13,01-13
...,...,...,...,...,...,...,...,...,...,...,...,...
41245,Wyoming,Uinta,41,2020-03-27,5,Good,PM10,2,2020,3,27,03-27
41246,Wyoming,Uinta,41,2020-03-28,6,Good,PM10,2,2020,3,28,03-28
41247,Wyoming,Uinta,41,2020-03-29,6,Good,PM10,2,2020,3,29,03-29
41248,Wyoming,Uinta,41,2020-03-30,5,Good,PM10,2,2020,3,30,03-30


In [71]:
five_year_avg = before_june_df.groupby(['County Code','Month_Day'])['AQI'].mean().reset_index()
five_year_avg = five_year_avg.rename(columns={'AQI':'Five-Year Avg.'})
five_year_avg

Unnamed: 0,County Code,Month_Day,Five-Year Avg.
0,1,01-01,36.950413
1,1,01-02,38.042735
2,1,01-03,43.327731
3,1,01-04,40.271186
4,1,01-05,38.921739
...,...,...,...
25010,840,05-25,6.000000
25011,840,05-26,13.000000
25012,840,05-27,9.000000
25013,840,05-30,11.500000


In [64]:
avg_20 = df20.groupby(['County Code','Month_Day'], as_index=False).mean().to_frame()
avg_20



AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [42]:
len(df20['County Code'].unique())
# len(df20['County Code'].unique())

143

In [45]:
compare_df = pd.merge(df20, five_year_avg,  how='left', 
                      left_on=['County Code','Month_Day'], 
                      right_on = ['County Code','Month_Day'])
compare_df                                                      

Unnamed: 0,County Code,Month_Day,AQI,Five-Year Avg.
0,1,01-01,19.200000,36.950413
1,1,01-02,23.928571,38.042735
2,1,01-03,24.857143,43.327731
3,1,01-04,24.866667,40.271186
4,1,01-05,18.000000,38.921739
...,...,...,...,...
11216,840,02-21,6.000000,
11217,840,02-27,2.000000,
11218,840,03-04,3.000000,6.000000
11219,840,03-10,6.000000,3.000000
