### Preprocess data and try different feature selection techniques
* Preprocess data with a preprocessing package from sklearn
* preprocess with pca?

### Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import klib
import plotly.figure_factory as ff
from IPython.core.display import display
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
#np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [2]:
# define useful function to create scatterplots of target feature against desired columns
def scatterplots(df, target, columns, ncol=None, figsize=(20, 25)):
    if ncol is None:
        ncol = len(columns)
    nrow = int(np.ceil(len(columns) / ncol))
    fig, axes = plt.subplots(nrow, ncol, figsize=figsize, squeeze=False)
    fig.subplots_adjust(wspace=0.7, hspace=0.7)
    for i, col in enumerate(columns):
        # ax = axes.flatten()[i]
        # ax.scatter(x = col, y = target, data=df, alpha=0.5)
        sns.regplot(ax=axes.flatten()[i], x=col, y=target, data=df, scatter_kws={'alpha':0.5})
        # ax.set(xlabel=col, ylabel=target)
    nsubplots = nrow * ncol
    for empty in range(i+1, nsubplots):
        axes.flatten()[empty].set_visible(False)

### Load data

In [3]:
pm10_2017_2019 = r'../../data/processed/pm10_2017_2019'
pm10_year_diff = r'../../data/processed/pm10_year_diff'
pm10_year_pct_change = r'../../data/processed/pm10_year_pct_change'
pm10_year_diff_top_corr =  r'../../data/processed/pm10_year_diff_top_corr'
pm10_year_pct_change_top_corr = r'../../data/processed/pm10_year_pct_change_top_corr'

df_norm = pd.read_csv(pm10_2017_2019)
df_year_diff = pd.read_csv(pm10_year_diff)
df_year_diff_top = pd.read_csv(pm10_year_diff_top_corr)
df_year_pct_change = pd.read_csv(pm10_year_pct_change)
df_year_pct_change_top = pd.read_csv(pm10_year_pct_change_top_corr)

In [4]:
# Work with top correlated variables with our target
# this reduces number of columns from 1000

df = pd.merge(df_year_diff, df_year_pct_change)
df

Unnamed: 0,AQI_2019,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",...,HWAC_MALE_ratio_2018_2019_pct_change,HWAC_FEMALE_ratio_2018_2019_pct_change,HBAC_MALE_ratio_2018_2019_pct_change,HBAC_FEMALE_ratio_2018_2019_pct_change,HIAC_MALE_ratio_2018_2019_pct_change,HIAC_FEMALE_ratio_2018_2019_pct_change,HAAC_MALE_ratio_2018_2019_pct_change,HAAC_FEMALE_ratio_2018_2019_pct_change,HNAC_MALE_ratio_2018_2019_pct_change,HNAC_FEMALE_ratio_2018_2019_pct_change
0,20.073389,1.008865,4312,6021,-1709,-0.6,-4148.0,-4526.0,-3770.0,-0.6,...,0.026476,0.031723,0.032864,0.040322,0.054139,0.074790,0.122425,-0.041309,0.055424,0.023801
1,12.958515,0.689292,5003,5641,-638,-0.5,-2312.0,-2147.0,-2477.0,-0.9,...,0.022018,0.030026,0.037618,0.016357,-0.005556,0.051458,-0.087266,0.243875,0.072299,-0.088200
2,13.125000,0.292164,-202,227,-429,-0.4,-2748.0,-2585.0,-2911.0,-1.2,...,0.025343,0.032246,-0.001476,0.063161,0.004428,-0.048117,-0.093386,0.020489,-0.010560,0.033049
3,11.247911,-1.114232,130,211,-81,-0.4,3009.0,3302.0,2716.0,4.2,...,0.042465,0.013679,0.095179,-0.115560,0.036155,-0.021558,-0.158081,0.130874,0.496745,0.122559
4,17.413408,-2.926909,155,159,-4,0.0,-946.0,-599.0,-1293.0,-1.0,...,0.008941,0.001484,-0.001274,-0.048402,0.019675,-0.012253,0.074008,-0.012205,0.018867,-0.083067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,1.567276,-466,-322,-144,-0.3,282.0,73.0,491.0,0.3,...,0.008593,0.038840,0.049928,0.096699,0.025571,-0.000068,0.352849,0.047547,0.142779,-0.166724
204,14.431990,1.676852,-6690,-4550,-2140,-0.4,-696.0,-987.0,-405.0,0.0,...,0.015727,0.015638,0.013946,0.019648,0.007658,0.006921,-0.001563,0.007946,0.016960,0.001314
205,15.656425,3.299519,-481,71,-552,-0.3,847.0,673.0,1021.0,0.2,...,0.020747,0.021930,0.045184,0.029480,0.100848,0.057137,0.117484,0.185540,0.058094,0.251201
206,13.354142,0.921309,-40,-75,35,0.1,-455.0,-555.0,-355.0,-0.6,...,0.006963,0.020251,-0.020994,0.047957,0.007984,-0.018559,-0.020887,0.018051,0.196858,0.397275


In [5]:

X = df.drop('AQI_2019', axis=1)
y = df['AQI_2019']


In [6]:
# Try Univariate feature selection
# Having top correlated features vs using all features yielded same result
# when performing with univariate linear regression as score function

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

X.shape

(208, 1124)

In [7]:
# returns the 20 best features
# X_new = SelectKBest(f_regression, k=20).fit_transform(X,y)
# X_new.shape
# X.columns[X_new.get_support()]
# Use f_regression as score function

selector = SelectKBest(f_regression, k=20)
selector.fit(X,y)
col_linear_reg = X.columns[selector.get_support()]
X.columns[selector.get_support()]

Index(['HAA_MALE_ratio_2017_2019_diff', 'HAAC_MALE_ratio_2017_2019_diff',
       'HIA_MALE_2018_2019_diff', 'HIA_FEMALE_2018_2019_diff',
       'HAA_MALE_2018_2019_diff', 'HIAC_MALE_2018_2019_diff',
       'HIAC_FEMALE_2018_2019_diff', 'HAAC_MALE_2018_2019_diff',
       'HAA_MALE_ratio_2018_2019_diff', 'NH_FEMALE_ratio_2017_2018_pct_change',
       'H_FEMALE_ratio_2017_2018_pct_change',
       'HWA_FEMALE_ratio_2017_2018_pct_change',
       'HWAC_FEMALE_ratio_2017_2018_pct_change',
       'NH_MALE_ratio_2017_2019_pct_change',
       'NH_FEMALE_ratio_2017_2019_pct_change',
       'H_FEMALE_ratio_2017_2019_pct_change',
       'Personal income (thousands of dollars) 2018_2019_pct_change',
       'Per capita personal income (dollars) 2018_2019_pct_change',
       'NH_MALE_ratio_2018_2019_pct_change',
       'NH_FEMALE_ratio_2018_2019_pct_change'],
      dtype='object')

In [8]:
# Perform univariate features selection with mutual info regression as score function
selector = SelectKBest(mutual_info_regression, k=20)
selector.fit(X,y)
col_mutual_info = X.columns[selector.get_support()]
X.columns[selector.get_support()]

Index(['HBA_MALE_2017_2018_diff', 'HIAC_MALE_2017_2018_diff',
       'HAAC_FEMALE_2017_2018_diff', 'HBAC_FEMALE_ratio_2017_2018_diff',
       'HIAC_MALE_2017_2019_diff', '90% CI LB 5-17 percent_2018_2019_diff',
       'AA_FEMALE_2018_2019_diff', 'NHTOM_MALE_2018_2019_diff',
       'HBA_FEMALE_2017_2018_pct_change',
       'HWAC_MALE_ratio_2017_2018_pct_change',
       'HIAC_FEMALE_ratio_2017_2018_pct_change',
       'NHWA_FEMALE_2017_2019_pct_change', 'NHWAC_FEMALE_2017_2019_pct_change',
       'HIAC_FEMALE_2017_2019_pct_change',
       'WA_FEMALE_ratio_2017_2019_pct_change',
       'WAC_FEMALE_ratio_2017_2019_pct_change',
       'HTOM_MALE_ratio_2017_2019_pct_change',
       'HIAC_FEMALE_ratio_2017_2019_pct_change',
       'NHWA_FEMALE_2018_2019_pct_change',
       'NHWAC_FEMALE_2018_2019_pct_change'],
      dtype='object')

In [9]:
display(set(col_mutual_info) - set(col_linear_reg))
display(set(col_linear_reg) - set(col_mutual_info))
# col_mutual_info

{'90% CI LB 5-17 percent_2018_2019_diff',
 'AA_FEMALE_2018_2019_diff',
 'HAAC_FEMALE_2017_2018_diff',
 'HBAC_FEMALE_ratio_2017_2018_diff',
 'HBA_FEMALE_2017_2018_pct_change',
 'HBA_MALE_2017_2018_diff',
 'HIAC_FEMALE_2017_2019_pct_change',
 'HIAC_FEMALE_ratio_2017_2018_pct_change',
 'HIAC_FEMALE_ratio_2017_2019_pct_change',
 'HIAC_MALE_2017_2018_diff',
 'HIAC_MALE_2017_2019_diff',
 'HTOM_MALE_ratio_2017_2019_pct_change',
 'HWAC_MALE_ratio_2017_2018_pct_change',
 'NHTOM_MALE_2018_2019_diff',
 'NHWAC_FEMALE_2017_2019_pct_change',
 'NHWAC_FEMALE_2018_2019_pct_change',
 'NHWA_FEMALE_2017_2019_pct_change',
 'NHWA_FEMALE_2018_2019_pct_change',
 'WAC_FEMALE_ratio_2017_2019_pct_change',
 'WA_FEMALE_ratio_2017_2019_pct_change'}

{'HAAC_MALE_2018_2019_diff',
 'HAAC_MALE_ratio_2017_2019_diff',
 'HAA_MALE_2018_2019_diff',
 'HAA_MALE_ratio_2017_2019_diff',
 'HAA_MALE_ratio_2018_2019_diff',
 'HIAC_FEMALE_2018_2019_diff',
 'HIAC_MALE_2018_2019_diff',
 'HIA_FEMALE_2018_2019_diff',
 'HIA_MALE_2018_2019_diff',
 'HWAC_FEMALE_ratio_2017_2018_pct_change',
 'HWA_FEMALE_ratio_2017_2018_pct_change',
 'H_FEMALE_ratio_2017_2018_pct_change',
 'H_FEMALE_ratio_2017_2019_pct_change',
 'NH_FEMALE_ratio_2017_2018_pct_change',
 'NH_FEMALE_ratio_2017_2019_pct_change',
 'NH_FEMALE_ratio_2018_2019_pct_change',
 'NH_MALE_ratio_2017_2019_pct_change',
 'NH_MALE_ratio_2018_2019_pct_change',
 'Per capita personal income (dollars) 2018_2019_pct_change',
 'Personal income (thousands of dollars) 2018_2019_pct_change'}

In [10]:
# Use Recursive Feature elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=20, step=1)
rfe.fit(X,y)

RFE(estimator=LinearRegression(), n_features_to_select=20)

In [11]:
col_recursion = X.columns[rfe.get_support()]
col_recursion

Index(['TOT_MALE_ratio_2017_2018_diff', 'TOT_FEMALE_ratio_2017_2018_diff',
       'AAC_FEMALE_2017_2018_pct_change', 'NH_FEMALE_2017_2018_pct_change',
       'NHTOM_MALE_2017_2018_pct_change', 'H_MALE_2017_2018_pct_change',
       'AAC_FEMALE_ratio_2017_2018_pct_change',
       'NH_FEMALE_ratio_2017_2018_pct_change',
       'NHTOM_MALE_ratio_2017_2018_pct_change',
       'H_MALE_ratio_2017_2018_pct_change', 'WA_FEMALE_2017_2019_pct_change',
       'NHWAC_MALE_2017_2019_pct_change',
       'WA_FEMALE_ratio_2017_2019_pct_change',
       'NHWAC_MALE_ratio_2017_2019_pct_change',
       'AA_FEMALE_2018_2019_pct_change', 'NHWA_FEMALE_2018_2019_pct_change',
       'NHAAC_MALE_2018_2019_pct_change',
       'AA_FEMALE_ratio_2018_2019_pct_change',
       'NHWA_FEMALE_ratio_2018_2019_pct_change',
       'NHAAC_MALE_ratio_2018_2019_pct_change'],
      dtype='object')

In [12]:
print('linear regression')
display(X[col_linear_reg].corrwith(y))
print('mutual regression')
display(X[col_mutual_info].corrwith(y))
print('recursion feature elemination')
display(X[col_recursion].corrwith(y))


linear regression
mutual regression
recursion feature elemination


HAA_MALE_ratio_2017_2019_diff                                  0.349482
HAAC_MALE_ratio_2017_2019_diff                                 0.360129
HIA_MALE_2018_2019_diff                                        0.306801
HIA_FEMALE_2018_2019_diff                                      0.317993
HAA_MALE_2018_2019_diff                                        0.326487
HIAC_MALE_2018_2019_diff                                       0.304698
HIAC_FEMALE_2018_2019_diff                                     0.308301
HAAC_MALE_2018_2019_diff                                       0.301281
HAA_MALE_ratio_2018_2019_diff                                  0.405938
NH_FEMALE_ratio_2017_2018_pct_change                          -0.362980
H_FEMALE_ratio_2017_2018_pct_change                           -0.326595
HWA_FEMALE_ratio_2017_2018_pct_change                         -0.302902
HWAC_FEMALE_ratio_2017_2018_pct_change                        -0.313699
NH_MALE_ratio_2017_2019_pct_change                            -0

HBA_MALE_2017_2018_diff                   0.021845
HIAC_MALE_2017_2018_diff                  0.260613
HAAC_FEMALE_2017_2018_diff                0.219866
HBAC_FEMALE_ratio_2017_2018_diff         -0.192010
HIAC_MALE_2017_2019_diff                  0.289161
90% CI LB 5-17 percent_2018_2019_diff    -0.127121
AA_FEMALE_2018_2019_diff                  0.158503
NHTOM_MALE_2018_2019_diff                 0.196788
HBA_FEMALE_2017_2018_pct_change          -0.131239
HWAC_MALE_ratio_2017_2018_pct_change     -0.267418
HIAC_FEMALE_ratio_2017_2018_pct_change   -0.144256
NHWA_FEMALE_2017_2019_pct_change         -0.053969
NHWAC_FEMALE_2017_2019_pct_change        -0.042125
HIAC_FEMALE_2017_2019_pct_change         -0.090425
WA_FEMALE_ratio_2017_2019_pct_change      0.032084
WAC_FEMALE_ratio_2017_2019_pct_change     0.005344
HTOM_MALE_ratio_2017_2019_pct_change     -0.153596
HIAC_FEMALE_ratio_2017_2019_pct_change   -0.131568
NHWA_FEMALE_2018_2019_pct_change         -0.057692
NHWAC_FEMALE_2018_2019_pct_chan

TOT_MALE_ratio_2017_2018_diff             0.224817
TOT_FEMALE_ratio_2017_2018_diff          -0.224817
AAC_FEMALE_2017_2018_pct_change          -0.011042
NH_FEMALE_2017_2018_pct_change           -0.019967
NHTOM_MALE_2017_2018_pct_change           0.009076
H_MALE_2017_2018_pct_change              -0.177533
AAC_FEMALE_ratio_2017_2018_pct_change    -0.117910
NH_FEMALE_ratio_2017_2018_pct_change     -0.362980
NHTOM_MALE_ratio_2017_2018_pct_change    -0.096640
H_MALE_ratio_2017_2018_pct_change        -0.269960
WA_FEMALE_2017_2019_pct_change            0.157120
NHWAC_MALE_2017_2019_pct_change          -0.013271
WA_FEMALE_ratio_2017_2019_pct_change      0.032084
NHWAC_MALE_ratio_2017_2019_pct_change    -0.247025
AA_FEMALE_2018_2019_pct_change           -0.016159
NHWA_FEMALE_2018_2019_pct_change         -0.057692
NHAAC_MALE_2018_2019_pct_change          -0.092975
AA_FEMALE_ratio_2018_2019_pct_change     -0.075225
NHWA_FEMALE_ratio_2018_2019_pct_change   -0.266054
NHAAC_MALE_ratio_2018_2019_pct_

From using sklearn feature selection tools was suprised to see that the tools
chosen columns that did not contain very high correlation scores with the target