Importing libraries

In [199]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import klib

import openaq
from IPython.core.display import display

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

#Load data

In [200]:
# pd.set_option('max_columns', None)
# pd.set_option('max_rows', None)
#np.set_printoptions(threshold=sys.maxsize)


daily_aqi_by_county_2017 = pd.read_csv('../../data/raw/daily_aqi_by_county_2017.csv')
daily_aqi_by_county_2018 = pd.read_csv('../../data/raw/daily_aqi_by_county_2018.csv')
daily_aqi_by_county_2019 = pd.read_csv('../../data/raw/daily_aqi_by_county_2019.csv')

annual_income_by_county = pd.read_csv('../../data/raw/CAINC1__ALL_AREAS_1969_2019.csv', encoding='latin-1')

county_ethnicities = pd.read_csv('../../data/raw/county_cc-est2019-alldata.csv', encoding='latin-1')



#Clean annual personal income by county

Remove columns we do not need and rows not needed

In [201]:
def remove_col(df, col_name):
    '''Returns a dataframe with removed column or columns from old dataframe'''
    new_df = df.copy()

    if(type(col_name) != str and len(col_name) > 1):
        for index in col_name:
            new_df = new_df.drop(str(index), axis=1)
    else:
        new_df = new_df.drop(str(col_name), axis=1)
    return new_df

# used to remove years we do not need
county_income_2017_2019 = remove_col(annual_income_by_county, range(1969, 2017))

# used to remove last 4 rows as they are not index data; looks like extra info
county_income_2017_2019 = county_income_2017_2019[:len(county_income_2017_2019) - 4]

# county_income_2017_2019.info()
county_income_2017_2019

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2017,2018,2019
0,"""00000""",United States,,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,16937582000,17839255000,18542262000
1,"""00000""",United States,,CAINC1,2.0,...,Population (persons) 1/,Number of persons,324985539,326687501,328239523
2,"""00000""",United States,,CAINC1,3.0,...,Per capita personal income (dollars) 2/,Dollars,52118,54606,56490
3,"""01000""",Alabama,5,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,199999756,208752683,216449038
4,"""01000""",Alabama,5,CAINC1,2.0,...,Population (persons) 1/,Number of persons,4874486,4887681,4903185
...,...,...,...,...,...,...,...,...,...,...,...
9589,"""97000""",Rocky Mountain,7,CAINC1,2.0,...,Population (persons) 1/,Number of persons,12062055,12233639,12399296
9590,"""97000""",Rocky Mountain,7,CAINC1,3.0,...,Per capita personal income (dollars) 2/,Dollars,49991,52936,54873
9591,"""98000""",Far West,8,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,3279057722,3472097346,3634009401
9592,"""98000""",Far West,8,CAINC1,2.0,...,Population (persons) 1/,Number of persons,56059482,56350416,56572426


change column types

In [202]:
# county_income_2017_2019['2017'].fillna(np.NaN, inplace=True)
# county_income_2017_2019['2017 personal income'] = county_income_2017_2019['2017'].astype(float)

# for i, item in enumerate(county_income_2017_2019['2017']):
#    try:
#       int(item)
#    except ValueError:
#        print('ERROR at index {}: {!r}'.format(i, item))
#        break

county_income_2017_2019['Year 2017 (thousands of dollars)'] = pd.to_numeric(county_income_2017_2019['2017'], downcast='float', errors='coerce')
county_income_2017_2019['Year 2018 (thousands of dollars)'] = pd.to_numeric(county_income_2017_2019['2018'], downcast='float', errors='coerce')
county_income_2017_2019['Year 2019 (thousands of dollars)'] = pd.to_numeric(county_income_2017_2019['2019'], downcast='float', errors='coerce')
county_income_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9594 entries, 0 to 9593
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   GeoFIPS                           9594 non-null   object 
 1   GeoName                           9594 non-null   object 
 2   Region                            9594 non-null   object 
 3   TableName                         9594 non-null   object 
 4   LineCode                          9594 non-null   float64
 5   IndustryClassification            9594 non-null   object 
 6   Description                       9594 non-null   object 
 7   Unit                              9594 non-null   object 
 8   2017                              9594 non-null   object 
 9   2018                              9594 non-null   object 
 10  2019                              9594 non-null   object 
 11  Year 2017 (thousands of dollars)  9519 non-null   float32
 12  Year 2

drop old object columns for years

In [203]:
county_income_2017_2019 = remove_col(county_income_2017_2019, range(2017,2020))
county_income_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9594 entries, 0 to 9593
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   GeoFIPS                           9594 non-null   object 
 1   GeoName                           9594 non-null   object 
 2   Region                            9594 non-null   object 
 3   TableName                         9594 non-null   object 
 4   LineCode                          9594 non-null   float64
 5   IndustryClassification            9594 non-null   object 
 6   Description                       9594 non-null   object 
 7   Unit                              9594 non-null   object 
 8   Year 2017 (thousands of dollars)  9519 non-null   float32
 9   Year 2018 (thousands of dollars)  9519 non-null   float32
 10  Year 2019 (thousands of dollars)  9519 non-null   float32
dtypes: float32(3), float64(1), object(7)
memory usage: 712.2+ KB


Need to deal with the 3 descriptions personal income, population, per capita personal income (dollars)

In [204]:
personal_income = county_income_2017_2019['Description'] == ('Personal income (thousands of dollars)')
county_personal_incomes_2017_2019 = county_income_2017_2019[personal_income]
county_personal_incomes_2017_2019.head()

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,Year 2017 (thousands of dollars),Year 2018 (thousands of dollars),Year 2019 (thousands of dollars)
0,"""00000""",United States,,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,16937580000.0,17839260000.0,18542260000.0
3,"""01000""",Alabama,5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,199999800.0,208752700.0,216449000.0
6,"""01001""","Autauga, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,2276561.0,2360366.0,2453617.0
9,"""01003""","Baldwin, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,9471242.0,10065970.0,10600260.0
12,"""01005""","Barbour, AL",5.0,CAINC1,1.0,...,Personal income (thousands of dollars),Thousands of dollars,838184.0,872189.0,882834.0


remove more columns
remove IndustryClassifaction as it only contains ... values and also not what we are interested in
remove TableName as only one value and not what we are interested in
remove Unit as that has the same value as Thousands of dollars
remove descripton as that is all personal income now

In [205]:
cols_to_remove = ['IndustryClassification', 'TableName', 'Unit', 'Description']
# county_personal_incomes_2017_2019 = remove_col(personal_income, cols_to_remove)
county_personal_incomes_2017_2019
county_personal_incomes_2017_2019 = remove_col(county_personal_incomes_2017_2019, cols_to_remove)
county_personal_incomes_2017_2019

Unnamed: 0,GeoFIPS,GeoName,Region,LineCode,Year 2017 (thousands of dollars),Year 2018 (thousands of dollars),Year 2019 (thousands of dollars)
0,"""00000""",United States,,1.0,1.693758e+10,1.783926e+10,1.854226e+10
3,"""01000""",Alabama,5,1.0,1.999998e+08,2.087527e+08,2.164490e+08
6,"""01001""","Autauga, AL",5,1.0,2.276561e+06,2.360366e+06,2.453617e+06
9,"""01003""","Baldwin, AL",5,1.0,9.471242e+06,1.006597e+07,1.060026e+07
12,"""01005""","Barbour, AL",5,1.0,8.381840e+05,8.721890e+05,8.828340e+05
...,...,...,...,...,...,...,...
9579,"""94000""",Plains,4,1.0,1.055029e+09,1.107630e+09,1.146515e+09
9582,"""95000""",Southeast,5,1.0,3.818755e+09,4.022276e+09,4.173677e+09
9585,"""96000""",Southwest,6,1.0,1.924648e+09,2.051027e+09,2.144764e+09
9588,"""97000""",Rocky Mountain,7,1.0,6.029942e+08,6.476007e+08,6.803901e+08


In [206]:
county_personal_incomes_2017_2019.info()
type(county_personal_incomes_2017_2019['GeoName'].values[1])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3198 entries, 0 to 9591
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   GeoFIPS                           3198 non-null   object 
 1   GeoName                           3198 non-null   object 
 2   Region                            3198 non-null   object 
 3   LineCode                          3198 non-null   float64
 4   Year 2017 (thousands of dollars)  3173 non-null   float32
 5   Year 2018 (thousands of dollars)  3173 non-null   float32
 6   Year 2019 (thousands of dollars)  3173 non-null   float32
dtypes: float32(3), float64(1), object(3)
memory usage: 162.4+ KB


str

# Clean AQI pollution

Combine the aqi pollution years together

In [207]:
# daily_aqi_by_county_2017.Category.unique()
# daily_aqi_by_county_2018.head()
county_daily_aqi_by_2017_2019 = pd.concat([daily_aqi_by_county_2017, daily_aqi_by_county_2018, daily_aqi_by_county_2019], axis=0)
# daily_aqi_by_county_2017
county_daily_aqi_by_2017_2019


Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,Alabama,Baldwin,1,3,2017-01-01,21,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2017-01-04,22,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2017-01-10,19,Good,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2017-01-13,30,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2017-01-16,16,Good,PM2.5,01-003-0010,1
...,...,...,...,...,...,...,...,...,...,...
341940,Wyoming,Weston,56,45,2019-12-27,36,Good,Ozone,56-045-0003,2
341941,Wyoming,Weston,56,45,2019-12-28,37,Good,Ozone,56-045-0003,2
341942,Wyoming,Weston,56,45,2019-12-29,34,Good,Ozone,56-045-0003,2
341943,Wyoming,Weston,56,45,2019-12-30,36,Good,Ozone,56-045-0003,2


In [208]:
county_daily_aqi_by_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1026379 entries, 0 to 341944
Data columns (total 10 columns):
 #   Column                     Non-Null Count    Dtype 
---  ------                     --------------    ----- 
 0   State Name                 1026379 non-null  object
 1   county Name                1026379 non-null  object
 2   State Code                 1026379 non-null  int64 
 3   County Code                1026379 non-null  int64 
 4   Date                       1026379 non-null  object
 5   AQI                        1026379 non-null  int64 
 6   Category                   1026379 non-null  object
 7   Defining Parameter         1026379 non-null  object
 8   Defining Site              1026379 non-null  object
 9   Number of Sites Reporting  1026379 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 86.1+ MB


# Clean demographic data
(clean up on years interested and age group interested in)
(check if column types are appropriate)
(remove sumlev col as it just indicates that we are using county level or equivalent)

In [209]:
year_2017_2019 = county_ethnicities['YEAR'] > 9
county_ethnicities_2017_2019 = county_ethnicities.loc[year_2017_2019]
county_ethnicities_2017_2019 = county_ethnicities_2017_2019.reset_index(drop=True)
county_ethnicities_2017_2019 = remove_col(county_ethnicities_2017_2019, 'SUMLEV')
county_ethnicities_2017_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179094 entries, 0 to 179093
Data columns (total 79 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   STATE         179094 non-null  int64 
 1   COUNTY        179094 non-null  int64 
 2   STNAME        179094 non-null  object
 3   CTYNAME       179094 non-null  object
 4   YEAR          179094 non-null  int64 
 5   AGEGRP        179094 non-null  int64 
 6   TOT_POP       179094 non-null  int64 
 7   TOT_MALE      179094 non-null  int64 
 8   TOT_FEMALE    179094 non-null  int64 
 9   WA_MALE       179094 non-null  int64 
 10  WA_FEMALE     179094 non-null  int64 
 11  BA_MALE       179094 non-null  int64 
 12  BA_FEMALE     179094 non-null  int64 
 13  IA_MALE       179094 non-null  int64 
 14  IA_FEMALE     179094 non-null  int64 
 15  AA_MALE       179094 non-null  int64 
 16  AA_FEMALE     179094 non-null  int64 
 17  NA_MALE       179094 non-null  int64 
 18  NA_FEMALE     179094 non

In [210]:
# reduce rows by having a reducing age group to totals
# maybe try binning age groups from 1-12 (0-59) and 13-19 (60+)
total = county_ethnicities_2017_2019.AGEGRP == 0
county_ethnicities_2017_2019 = county_ethnicities_2017_2019.loc[total]
county_ethnicities_2017_2019 = remove_col(county_ethnicities_2017_2019, 'AGEGRP')


Need to reduce ethnicity columns by combining similar or dropping irrelevant columns

In [211]:
# find way to bin for every number of rows
ethnicites = list(county_ethnicities_2017_2019.iloc[:, 20:29])
ethnicites.extend(list(county_ethnicities_2017_2019.iloc[:, 32:54]))
ethnicites.extend(list(county_ethnicities_2017_2019.iloc[:, 56:]))
county_ethnicities_2017_2019 = remove_col(county_ethnicities_2017_2019, ethnicites)

# can add columns

In [212]:
county_ethnicities_2017_2019

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,YEAR,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,...,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,NAC_FEMALE,NH_MALE,NH_FEMALE,H_MALE,H_FEMALE
0,1,1,Alabama,Autauga County,10,55390,27010,28380,20876,21705,...,385,30,26,561,481,55,26188,27643,822,737
19,1,1,Alabama,Autauga County,11,55533,27006,28527,20917,21749,...,374,33,28,535,494,58,26140,27771,866,756
38,1,1,Alabama,Autauga County,12,55869,27092,28777,20878,21729,...,370,32,26,538,507,49,26208,27990,884,787
57,1,3,Alabama,Baldwin County,10,212521,103218,109303,90360,95215,...,1361,69,69,1785,1913,168,98087,104753,5131,4550
76,1,3,Alabama,Baldwin County,11,217855,105702,112153,92613,97815,...,1396,70,72,1897,1984,176,100355,107363,5347,4790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178999,56,43,Wyoming,Washakie County,11,7877,3984,3893,3781,3670,...,37,5,3,86,90,8,3416,3352,568,541
179018,56,43,Wyoming,Washakie County,12,7805,3963,3842,3759,3618,...,39,4,2,87,89,11,3398,3299,565,543
179037,56,45,Wyoming,Weston County,10,6968,3660,3308,3447,3111,...,63,2,0,78,75,7,3501,3180,159,128
179056,56,45,Wyoming,Weston County,11,6924,3627,3297,3401,3073,...,74,2,0,83,84,3,3475,3176,152,121


# Combine the three separate dataframes

In [213]:
county_ethnicities_2017_2019
county_personal_incomes_2017_2019
county_daily_aqi_by_2017_2019

Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,Alabama,Baldwin,1,3,2017-01-01,21,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2017-01-04,22,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2017-01-10,19,Good,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2017-01-13,30,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2017-01-16,16,Good,PM2.5,01-003-0010,1
...,...,...,...,...,...,...,...,...,...,...
341940,Wyoming,Weston,56,45,2019-12-27,36,Good,Ozone,56-045-0003,2
341941,Wyoming,Weston,56,45,2019-12-28,37,Good,Ozone,56-045-0003,2
341942,Wyoming,Weston,56,45,2019-12-29,34,Good,Ozone,56-045-0003,2
341943,Wyoming,Weston,56,45,2019-12-30,36,Good,Ozone,56-045-0003,2
