Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt

import openaq
from IPython.core.display import display

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

load data

In [2]:
# pd.set_option('max_columns', None)
# pd.set_option('max_rows', None)
#np.set_printoptions(threshold=sys.maxsize)


daily_aqi_by_county_2017 = pd.read_csv('../../data/raw/daily_aqi_by_county_2017.csv')
daily_aqi_by_county_2018 = pd.read_csv('../../data/raw/daily_aqi_by_county_2018.csv')
daily_aqi_by_county_2019 = pd.read_csv('../../data/raw/daily_aqi_by_county_2019.csv')

annual_income_by_county = pd.read_csv('../../data/raw/CAINC1__ALL_AREAS_1969_2019.csv', encoding='latin-1')

county_ethnicities = pd.read_csv('../../data/raw/county_cc-est2019-alldata.csv', encoding='latin-1')



Clean annual personal income by county

Remove columns we do not need and rows not needed

In [3]:
def remove_col(df, col_name):
    '''Returns a dataframe with removed column or columns from old dataframe'''
    new_df = df.copy()

    if(type(col_name) != str and len(col_name) > 1):
        for index in col_name:
            new_df = new_df.drop(str(index), axis=1)
    else:
        new_df = new_df.drop(str(col_name), axis=1)
    return new_df

# used to remove years we do not need
income_by_county_2017_2019 = remove_col(annual_income_by_county, range(1969, 2017))

# used to remove last 4 rows as they are not index data; looks like extra info
income_by_county_2017_2019 = income_by_county_2017_2019[:len(income_by_county_2017_2019) - 4]

# income_by_county_2017_2019.info()
income_by_county_2017_2019

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2017,2018,2019
0,"""00000""",United States,,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,16937582000,17839255000,18542262000
1,"""00000""",United States,,CAINC1,2.0,...,Population (persons) 1/,Number of persons,324985539,326687501,328239523
2,"""00000""",United States,,CAINC1,3.0,...,Per capita personal income (dollars) 2/,Dollars,52118,54606,56490
3,"""01000""",Alabama,5,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,199999756,208752683,216449038
4,"""01000""",Alabama,5,CAINC1,2.0,...,Population (persons) 1/,Number of persons,4874486,4887681,4903185
...,...,...,...,...,...,...,...,...,...,...,...
9589,"""97000""",Rocky Mountain,7,CAINC1,2.0,...,Population (persons) 1/,Number of persons,12062055,12233639,12399296
9590,"""97000""",Rocky Mountain,7,CAINC1,3.0,...,Per capita personal income (dollars) 2/,Dollars,49991,52936,54873
9591,"""98000""",Far West,8,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,3279057722,3472097346,3634009401
9592,"""98000""",Far West,8,CAINC1,2.0,...,Population (persons) 1/,Number of persons,56059482,56350416,56572426


change column types

In [4]:
# income_by_county_2017_2019['2017'].fillna(np.NaN, inplace=True)
# income_by_county_2017_2019['2017 personal income'] = income_by_county_2017_2019['2017'].astype(float)

# for i, item in enumerate(income_by_county_2017_2019['2017']):
#    try:
#       int(item)
#    except ValueError:
#        print('ERROR at index {}: {!r}'.format(i, item))
#        break

income_by_county_2017_2019['Year 2017 (thousands of dollars)'] = pd.to_numeric(income_by_county_2017_2019['2017'], downcast='float', errors='coerce')
income_by_county_2017_2019['Year 2018 (thousands of dollars)'] = pd.to_numeric(income_by_county_2017_2019['2018'], downcast='float', errors='coerce')
income_by_county_2017_2019['Year 2019 (thousands of dollars)'] = pd.to_numeric(income_by_county_2017_2019['2019'], downcast='float', errors='coerce')
income_by_county_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9594 entries, 0 to 9593
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   GeoFIPS                           9594 non-null   object 
 1   GeoName                           9594 non-null   object 
 2   Region                            9594 non-null   object 
 3   TableName                         9594 non-null   object 
 4   LineCode                          9594 non-null   float64
 5   IndustryClassification            9594 non-null   object 
 6   Description                       9594 non-null   object 
 7   Unit                              9594 non-null   object 
 8   2017                              9594 non-null   object 
 9   2018                              9594 non-null   object 
 10  2019                              9594 non-null   object 
 11  Year 2017 (thousands of dollars)  9519 non-null   float32
 12  Year 2

drop old object columns for years

In [5]:
income_by_county_2017_2019 = remove_col(income_by_county_2017_2019, range(2017,2020))
income_by_county_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9594 entries, 0 to 9593
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   GeoFIPS                           9594 non-null   object 
 1   GeoName                           9594 non-null   object 
 2   Region                            9594 non-null   object 
 3   TableName                         9594 non-null   object 
 4   LineCode                          9594 non-null   float64
 5   IndustryClassification            9594 non-null   object 
 6   Description                       9594 non-null   object 
 7   Unit                              9594 non-null   object 
 8   Year 2017 (thousands of dollars)  9519 non-null   float32
 9   Year 2018 (thousands of dollars)  9519 non-null   float32
 10  Year 2019 (thousands of dollars)  9519 non-null   float32
dtypes: float32(3), float64(1), object(7)
memory usage: 712.2+ KB


need to deal with the 3 descriptions personal income, population, per capita personal income (dollars)

In [6]:
personal_income = income_by_county_2017_2019['Description'] == ('Personal income (thousands of dollars)')
personal_incomes_2017_2019 = income_by_county_2017_2019[personal_income]
personal_incomes_2017_2019.head()

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,Year 2017 (thousands of dollars),Year 2018 (thousands of dollars),Year 2019 (thousands of dollars)
0,"""00000""",United States,,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,16937580000.0,17839260000.0,18542260000.0
3,"""01000""",Alabama,5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,199999800.0,208752700.0,216449000.0
6,"""01001""","Autauga, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,2276561.0,2360366.0,2453617.0
9,"""01003""","Baldwin, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,9471242.0,10065970.0,10600260.0
12,"""01005""","Barbour, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,838184.0,872189.0,882834.0


remove more columns
remove IndustryClassifaction as it only contains ... values and also not what we are interested in
remove TableName as only one value and not what we are interested in
remove Unit as that has the same value as Thousands of dollars
remove descripton as that is all personal income now

In [7]:
cols_to_remove = ['IndustryClassification', 'TableName', 'Unit', 'Description']
# personal_incomes_2017_2019 = remove_col(personal_income, cols_to_remove)
personal_incomes_2017_2019
personal_incomes_2017_2019 = remove_col(personal_incomes_2017_2019, cols_to_remove)
personal_incomes_2017_2019

Unnamed: 0,GeoFIPS,GeoName,Region,LineCode,Year 2017 (thousands of dollars),Year 2018 (thousands of dollars),Year 2019 (thousands of dollars)
0,"""00000""",United States,,1.0,1.693758e+10,1.783926e+10,1.854226e+10
3,"""01000""",Alabama,5,1.0,1.999998e+08,2.087527e+08,2.164490e+08
6,"""01001""","Autauga, AL",5,1.0,2.276561e+06,2.360366e+06,2.453617e+06
9,"""01003""","Baldwin, AL",5,1.0,9.471242e+06,1.006597e+07,1.060026e+07
12,"""01005""","Barbour, AL",5,1.0,8.381840e+05,8.721890e+05,8.828340e+05
...,...,...,...,...,...,...,...
9579,"""94000""",Plains,4,1.0,1.055029e+09,1.107630e+09,1.146515e+09
9582,"""95000""",Southeast,5,1.0,3.818755e+09,4.022276e+09,4.173677e+09
9585,"""96000""",Southwest,6,1.0,1.924648e+09,2.051027e+09,2.144764e+09
9588,"""97000""",Rocky Mountain,7,1.0,6.029942e+08,6.476007e+08,6.803901e+08


Combine the aqi pollution years together

In [8]:
# daily_aqi_by_county_2017.Category.unique()
# daily_aqi_by_county_2018.head()
daily_aqi_by_county_2017_2019 = pd.concat([daily_aqi_by_county_2017, daily_aqi_by_county_2018, daily_aqi_by_county_2019], axis=0)
# daily_aqi_by_county_2017
daily_aqi_by_county_2017_2019

Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,Alabama,Baldwin,1,3,2017-01-01,21,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2017-01-04,22,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2017-01-10,19,Good,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2017-01-13,30,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2017-01-16,16,Good,PM2.5,01-003-0010,1
...,...,...,...,...,...,...,...,...,...,...
341940,Wyoming,Weston,56,45,2019-12-27,36,Good,Ozone,56-045-0003,2
341941,Wyoming,Weston,56,45,2019-12-28,37,Good,Ozone,56-045-0003,2
341942,Wyoming,Weston,56,45,2019-12-29,34,Good,Ozone,56-045-0003,2
341943,Wyoming,Weston,56,45,2019-12-30,36,Good,Ozone,56-045-0003,2


merging and concat pollution data

In [17]:
# daily_aqi_by_county_2018_inner = pd.merge(left=daily_aqi_by_county_2017, right=daily_aqi_by_county_2018, left_on='Defining Site', right_on='Defining Site')
# daily_aqi_by_county_2018_inner.info()
# print('done')
# left = daily_aqi_by_county_2017[:1]
# display(left)
# right = daily_aqi_by_county_2018[:1]
# display(right)
# mergeleftright = pd.merge(left=left, right=right, left_on='Defining Parameter', right_on='Defining Parameter')
# mergeleftright