### Preprocess data and try different feature selection techniques
* Preprocess data with a preprocessing package from sklearn
* preprocess with pca?

### Import Packages

In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import klib
import plotly.figure_factory as ff
from IPython.core.display import display
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
#np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [55]:
# define useful function to create scatterplots of target feature against desired columns
def scatterplots(df, target, columns, ncol=None, figsize=(20, 25)):
    if ncol is None:
        ncol = len(columns)
    nrow = int(np.ceil(len(columns) / ncol))
    fig, axes = plt.subplots(nrow, ncol, figsize=figsize, squeeze=False)
    fig.subplots_adjust(wspace=0.7, hspace=0.7)
    for i, col in enumerate(columns):
        # ax = axes.flatten()[i]
        # ax.scatter(x = col, y = target, data=df, alpha=0.5)
        sns.regplot(ax=axes.flatten()[i], x=col, y=target, data=df, scatter_kws={'alpha':0.5})
        # ax.set(xlabel=col, ylabel=target)
    nsubplots = nrow * ncol
    for empty in range(i+1, nsubplots):
        axes.flatten()[empty].set_visible(False)

### Load data

In [56]:
pm10_2017_2019 = r'../../data/processed/pm10_2017_2019'
pm10_year_diff = r'../../data/processed/pm10_year_diff'
pm10_year_pct_change = r'../../data/processed/pm10_year_pct_change'
pm10_year_diff_top_corr =  r'../../data/processed/pm10_year_diff_top_corr'
pm10_year_pct_change_top_corr = r'../../data/processed/pm10_year_pct_change_top_corr'

df_norm = pd.read_csv(pm10_2017_2019)
df_year_diff = pd.read_csv(pm10_year_diff)
df_year_diff_top = pd.read_csv(pm10_year_diff_top_corr)
df_year_pct_change = pd.read_csv(pm10_year_pct_change)
df_year_pct_change_top = pd.read_csv(pm10_year_pct_change_top_corr)

In [57]:
# Work with top correlated variables with our target
# this reduces number of columns from 1000

df = pd.merge(df_year_diff_top, df_year_pct_change_top)
df

Unnamed: 0,AQI_2019,HAA_MALE_ratio_2018_2019_diff,HAAC_MALE_ratio_2017_2019_diff,HAA_MALE_ratio_2017_2019_diff,HAA_MALE_2018_2019_diff,HIA_FEMALE_2018_2019_diff,HIAC_FEMALE_2018_2019_diff,HIA_MALE_2018_2019_diff,HIAC_MALE_2018_2019_diff,HAAC_MALE_2018_2019_diff,...,HWAC_FEMALE_ratio_2017_2019_pct_change,HWA_FEMALE_ratio_2017_2018_pct_change,H_FEMALE_ratio_2017_2019_pct_change,NH_MALE_ratio_2017_2019_pct_change,HWAC_FEMALE_ratio_2017_2018_pct_change,H_FEMALE_ratio_2017_2018_pct_change,NH_MALE_ratio_2018_2019_pct_change,NH_FEMALE_ratio_2018_2019_pct_change,NH_FEMALE_ratio_2017_2018_pct_change,NH_FEMALE_ratio_2017_2019_pct_change
0,20.073389,0.001227,0.004126,0.002594,8,14,24,-10,20,15,...,0.066156,0.032477,0.069673,-0.002724,0.033374,0.036809,-0.001246,-0.001100,-0.000907,-0.002005
1,12.958515,-0.001258,-0.004820,-0.001981,-4,8,24,-7,4,-9,...,0.067250,0.035598,0.065508,-0.001108,0.036139,0.036086,-0.000012,-0.002554,-0.001858,-0.004407
2,13.125000,0.000885,-0.002493,0.000067,2,-4,-12,5,1,-10,...,0.056305,0.019292,0.055696,-0.005313,0.023307,0.015763,-0.004204,0.001529,0.000176,0.001705
3,11.247911,-0.000051,-0.001577,0.006876,0,-20,-15,13,28,-5,...,0.028091,0.013565,0.031366,-0.003912,0.014217,0.031752,0.001109,-0.003511,0.001533,-0.001983
4,17.413408,0.002806,0.006851,0.002255,3,3,-14,14,14,19,...,0.001100,-0.000633,0.001121,-0.004857,-0.000383,0.000351,-0.007802,0.002468,-0.001403,0.001062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,0.002276,0.005661,0.002258,2,3,0,-4,2,6,...,0.064546,0.022456,0.066122,-0.003205,0.024745,0.026686,-0.001818,-0.000372,-0.001533,-0.001905
204,14.431990,-0.000792,0.001185,0.000715,-8,13,17,21,19,-2,...,0.046970,0.030851,0.045745,-0.007886,0.030850,0.029433,-0.003559,-0.002157,-0.005276,-0.007421
205,15.656425,-0.000568,0.004939,0.000109,-2,7,20,24,33,16,...,0.044822,0.021305,0.047402,-0.001170,0.022401,0.023545,-0.000766,-0.001552,-0.001512,-0.003062
206,13.354142,-0.005542,0.021831,0.012354,-5,-10,-5,2,6,-2,...,0.028606,0.010813,0.025964,-0.002544,0.008189,0.009373,-0.002465,-0.001158,-0.000593,-0.001750


In [58]:

X = df.drop('AQI_2019', axis=1)
y = df['AQI_2019']


In [59]:
# Try Univariate feature selection
# Having top correlated features vs using all features yielded same result
# when performing with univariate linear regression as score function

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

X.shape

(208, 160)

In [81]:
# returns the 20 best features
# X_new = SelectKBest(f_regression, k=20).fit_transform(X,y)
# X_new.shape
# X.columns[X_new.get_support()]
# Use f_regression as score function

selector = SelectKBest(f_regression, k=20)
selector.fit(X,y)
col_linear_reg = X.columns[selector.get_support()]
X.columns[selector.get_support()]

Index(['HAA_MALE_ratio_2018_2019_diff', 'HAAC_MALE_ratio_2017_2019_diff',
       'HAA_MALE_ratio_2017_2019_diff', 'HAA_MALE_2018_2019_diff',
       'HIA_FEMALE_2018_2019_diff', 'HIAC_FEMALE_2018_2019_diff',
       'HIA_MALE_2018_2019_diff', 'HIAC_MALE_2018_2019_diff',
       'HAAC_MALE_2018_2019_diff',
       'Personal income (thousands of dollars) 2018_2019_pct_change',
       'Per capita personal income (dollars) 2018_2019_pct_change',
       'HWA_FEMALE_ratio_2017_2018_pct_change',
       'H_FEMALE_ratio_2017_2019_pct_change',
       'NH_MALE_ratio_2017_2019_pct_change',
       'HWAC_FEMALE_ratio_2017_2018_pct_change',
       'H_FEMALE_ratio_2017_2018_pct_change',
       'NH_MALE_ratio_2018_2019_pct_change',
       'NH_FEMALE_ratio_2018_2019_pct_change',
       'NH_FEMALE_ratio_2017_2018_pct_change',
       'NH_FEMALE_ratio_2017_2019_pct_change'],
      dtype='object')

In [82]:
# Perform univariate features selection with mutual info regression as score function
selector = SelectKBest(mutual_info_regression, k=20)
selector.fit(X,y)
col_mutual_info = X.columns[selector.get_support()]
X.columns[selector.get_support()]

Index(['HAAC_MALE_2018_2019_diff', 'HIAC_MALE_2017_2019_diff',
       'HAAC_MALE_ratio_2018_2019_diff', 'HIAC_MALE_2017_2018_diff',
       'HWA_FEMALE_2017_2019_diff', 'IAC_FEMALE_2018_2019_diff',
       'HAAC_FEMALE_2017_2018_diff', 'TOM_MALE_2018_2019_diff',
       'Poverty Estimate, Age 5-17 in Families_2018_2019_diff',
       'Median Household Income_2017_2019_pct_change',
       'BACH_TOT_ratio_2017_2019_pct_change', 'Employed_2017_2018_pct_change',
       'HWAC_MALE_ratio_2018_2019_pct_change',
       'HWAC_MALE_ratio_2017_2018_pct_change',
       'H_MALE_ratio_2017_2018_pct_change',
       'NHWAC_FEMALE_ratio_2017_2019_pct_change',
       'NHWA_FEMALE_ratio_2017_2019_pct_change',
       'NHWA_FEMALE_ratio_2017_2018_pct_change',
       'HWA_FEMALE_ratio_2017_2018_pct_change',
       'NH_FEMALE_ratio_2017_2018_pct_change'],
      dtype='object')

In [87]:
display(set(col_mutual_info) - set(col_linear_reg))
display(set(col_linear_reg) - set(col_mutual_info))
# col_mutual_info

{'BACH_TOT_ratio_2017_2019_pct_change',
 'Employed_2017_2018_pct_change',
 'HAAC_FEMALE_2017_2018_diff',
 'HAAC_MALE_ratio_2018_2019_diff',
 'HIAC_MALE_2017_2018_diff',
 'HIAC_MALE_2017_2019_diff',
 'HWAC_MALE_ratio_2017_2018_pct_change',
 'HWAC_MALE_ratio_2018_2019_pct_change',
 'HWA_FEMALE_2017_2019_diff',
 'H_MALE_ratio_2017_2018_pct_change',
 'IAC_FEMALE_2018_2019_diff',
 'Median Household Income_2017_2019_pct_change',
 'NHWAC_FEMALE_ratio_2017_2019_pct_change',
 'NHWA_FEMALE_ratio_2017_2018_pct_change',
 'NHWA_FEMALE_ratio_2017_2019_pct_change',
 'Poverty Estimate, Age 5-17 in Families_2018_2019_diff',
 'TOM_MALE_2018_2019_diff'}

{'HAAC_MALE_ratio_2017_2019_diff',
 'HAA_MALE_2018_2019_diff',
 'HAA_MALE_ratio_2017_2019_diff',
 'HAA_MALE_ratio_2018_2019_diff',
 'HIAC_FEMALE_2018_2019_diff',
 'HIAC_MALE_2018_2019_diff',
 'HIA_FEMALE_2018_2019_diff',
 'HIA_MALE_2018_2019_diff',
 'HWAC_FEMALE_ratio_2017_2018_pct_change',
 'H_FEMALE_ratio_2017_2018_pct_change',
 'H_FEMALE_ratio_2017_2019_pct_change',
 'NH_FEMALE_ratio_2017_2019_pct_change',
 'NH_FEMALE_ratio_2018_2019_pct_change',
 'NH_MALE_ratio_2017_2019_pct_change',
 'NH_MALE_ratio_2018_2019_pct_change',
 'Per capita personal income (dollars) 2018_2019_pct_change',
 'Personal income (thousands of dollars) 2018_2019_pct_change'}

In [90]:
# Use Recursive Feature elemination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=20, step=1)
rfe.fit(X,y)

RFE(estimator=LinearRegression(), n_features_to_select=20)

In [92]:
X.columns[rfe.get_support()]

Index(['HAAC_MALE_ratio_2017_2019_diff', 'HAAC_MALE_ratio_2018_2019_diff',
       'HAAC_MALE_ratio_2017_2018_diff', 'TOT_MALE_ratio_2017_2019_diff',
       'TOT_MALE_ratio_2017_2018_diff', 'TOT_FEMALE_ratio_2017_2018_diff',
       'TOT_FEMALE_ratio_2017_2019_diff',
       'TOT_MALE_ratio_2017_2019_pct_change',
       'HWA_FEMALE_2017_2018_pct_change', 'HWAC_FEMALE_2017_2018_pct_change',
       'TOT_FEMALE_ratio_2017_2019_pct_change',
       'NHWA_FEMALE_ratio_2018_2019_pct_change',
       'NHWA_FEMALE_ratio_2017_2019_pct_change',
       'NHWA_FEMALE_ratio_2017_2018_pct_change',
       'NHWAC_FEMALE_ratio_2017_2018_pct_change',
       'HWA_FEMALE_ratio_2017_2018_pct_change',
       'HWAC_FEMALE_ratio_2017_2018_pct_change',
       'NH_FEMALE_ratio_2018_2019_pct_change',
       'NH_FEMALE_ratio_2017_2018_pct_change',
       'NH_FEMALE_ratio_2017_2019_pct_change'],
      dtype='object')