# Modeling with different parameters of train test split and k_best_feature size
The changes implemented from this run compared to Modeling_0 notebook were
that a 70-30 train test split was used instead of 75-25 and that 100 best feature were selected rather
than 25.

### goals
* develop a model that effectively predicts 2019 AQI pollution levels
* fit model with training dataset
* try grid search (b/c we have a small dataset (maybe experiemnt with random grid))


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import preprocessing, svm
from sklearn.metrics import r2_score, accuracy_score

#Use to ignore convergence warnings
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)


# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
# np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [2]:
# import X and y training and test sets

X = pd.read_csv('../../data/train_test/X_alt')

X_train = pd.read_csv('../../data/train_test/X_train_scaled_alt')

X_test = pd.read_csv('../../data/train_test/X_test_scaled_alt')

y = pd.read_csv('../../data/train_test/y_alt')

y_train = pd.read_csv('../../data/train_test/y_train_alt')

y_test = pd.read_csv('../../data/train_test/y_test_alt')


# Form a baseline for comparison


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
#

In [4]:
# # because we negative observation we cannot perform logarithmic scale and will instead perform standardization
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

In [5]:
# scale test data using the scaler fitted from the training set

X_test_scaled = scaler.transform(X_test)

In [6]:
# Changed scaled X's from numpy array to dataframe and assign scaled data to X_train and X_test

# retain X_train_scaled as a dataframe
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
display(X_train)

# retain X_test_scale as a dataframe
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
display(X_test)


Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
146,0.594218,-0.375958,-0.582788,0.435165,-1.312745,0.262910,0.282243,0.240843,1.490416,1.547001,...,-0.943217,-0.926925,-0.949138,-0.967574,-1.064088,-1.067930,-0.869256,-0.838112,-1.038778,-1.192414
108,-1.982133,-0.420560,-0.603975,0.397203,-0.130459,-0.076699,-0.092549,-0.059594,-1.050398,-1.126601,...,-0.815275,-0.782558,-0.838428,-0.830362,-0.885248,-0.914873,-0.791706,-0.826719,-1.073653,-1.102536
177,-0.843736,-0.451911,-0.515057,0.176570,-0.721602,0.402424,0.389249,0.413326,0.714056,0.611241,...,-0.673420,-0.689367,-0.424824,-0.445078,-0.802055,-0.877949,-0.679542,-0.685785,-0.766461,-0.802649
31,3.370154,1.013790,1.553493,-1.139770,-1.017173,0.516303,0.393026,0.641425,0.290587,0.210200,...,0.153800,0.157805,0.703216,0.758966,0.824383,0.938466,2.571984,2.675531,2.114922,2.225606
12,-0.653791,-0.031273,-0.319508,0.542562,1.347398,0.506806,0.549488,0.458608,1.137525,1.212801,...,-0.059175,-0.087728,-0.618701,-0.657432,-0.397147,-0.357200,-0.405688,-0.489052,-0.179028,-0.137092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.467927,1.493027,1.321300,0.127251,0.460684,-0.780538,-0.821000,-0.732712,-0.838663,-0.859240,...,-0.417258,-0.388750,0.651096,0.561106,-0.687137,-0.684148,-0.544081,-0.538042,-0.563176,-0.546624
14,0.391250,0.020607,0.319414,-0.560931,-0.426031,-1.218074,-1.059561,-1.375015,-0.768085,-0.658720,...,0.332519,0.342909,0.173178,0.189056,0.301678,0.314180,2.278455,2.274616,1.739947,1.654436
92,-1.318496,-0.493527,-0.695324,0.441005,0.460684,0.495095,0.479350,0.508023,1.702150,1.547001,...,-0.363758,-0.349930,0.049311,0.232117,-0.089629,-0.075910,-0.446907,-0.393218,-0.594221,-0.583362
179,1.224570,-0.335088,-0.577230,0.495839,0.756255,0.220770,0.222085,0.217826,0.431744,0.410721,...,-0.792804,-0.762465,-0.839308,-0.812844,0.971857,0.851106,-0.580799,-0.644174,-0.658215,-0.710151


Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
161,1.227380,-0.012052,0.177006,-0.351653,-1.017173,-0.032991,-0.042553,-0.022768,-0.062303,-0.124000,...,-0.318024,-0.344416,-0.406842,-0.423451,0.400127,0.413433,-0.517686,-0.491465,-0.602214,-0.473454
15,-0.484640,-0.168811,-0.332533,0.327769,-0.721602,0.135937,0.151767,0.118432,0.008275,0.009680,...,-0.316843,-0.298327,-0.748370,-0.794657,0.069791,-0.028561,0.106301,0.029162,-0.006590,0.010520
73,-0.408987,0.205545,0.611695,-0.785458,-1.017173,-0.924846,-0.707070,-1.145789,-0.768085,-0.591880,...,-0.644018,-0.644811,0.019953,0.053187,-0.861339,-0.857368,-0.651783,-0.615012,-0.905273,-0.879657
96,0.815192,-0.499313,-0.713559,0.465016,-0.130459,0.126624,0.132794,0.119277,-0.203460,-0.257680,...,-0.873626,-0.843974,-0.825571,-0.782748,-0.894092,-0.951227,-0.735885,-0.689069,-0.860816,-0.985739
166,-0.200375,-0.583664,-0.668406,0.233999,-1.608316,0.146634,0.144394,0.147931,0.220009,0.143360,...,-0.922825,-0.886868,-0.827679,-0.809962,-1.033983,-1.018763,-0.864755,-0.826767,-0.992890,-1.001058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,-0.030360,13.732587,12.350036,0.802455,1.051827,-3.091411,-2.971195,-3.195016,-0.697507,-0.658720,...,0.645356,0.647085,0.068192,0.059342,0.578675,0.606145,0.521103,0.548428,0.476733,0.485657
132,0.177150,-1.450694,-0.987259,-0.677737,-0.130459,-1.688344,-1.487675,-1.885983,-0.274038,-0.257680,...,1.042082,1.274237,15.526747,16.923427,2.673479,3.245066,1.010348,1.087883,4.242902,5.037475
56,0.951423,-0.226290,0.004034,-0.400971,-0.721602,0.610173,0.713145,0.498252,0.572900,0.611241,...,-0.102029,-0.063737,1.601931,1.766981,-0.508841,-0.461485,-0.396851,-0.391290,-0.183328,-0.137160
127,-0.853307,-0.945704,-0.920223,0.075013,-0.130459,0.683664,0.763681,0.595204,0.996369,1.079121,...,-0.230603,-0.203591,2.459165,2.554949,-0.384121,-0.328048,-0.325514,-0.311833,-0.062814,-0.010665


In [7]:
# used to test feature selection
select_k_best = SelectKBest(f_regression, k=100)
X_train_k_best = select_k_best.fit_transform(X_train, y_train)

best_k_cols = X_train.columns[select_k_best.get_support()]
best_k_cols

Index(['AQI_2017_2018_diff', 'TOT_MALE_ratio_2017_2018_diff',
       'TOT_FEMALE_ratio_2017_2018_diff', 'HBA_MALE_ratio_2017_2018_diff',
       'HBA_FEMALE_ratio_2017_2018_diff', 'HBAC_MALE_ratio_2017_2018_diff',
       'HBAC_FEMALE_ratio_2017_2018_diff', 'HAAC_MALE_ratio_2017_2018_diff',
       'HAAC_MALE_2017_2019_diff', 'TOT_MALE_ratio_2017_2019_diff',
       'TOT_FEMALE_ratio_2017_2019_diff', 'WA_MALE_ratio_2017_2019_diff',
       'WAC_MALE_ratio_2017_2019_diff', 'HAA_MALE_ratio_2017_2019_diff',
       'HBAC_FEMALE_ratio_2017_2019_diff', 'HAAC_MALE_ratio_2017_2019_diff',
       'HIA_FEMALE_2018_2019_diff', 'HAA_MALE_2018_2019_diff',
       'HIAC_FEMALE_2018_2019_diff', 'HAAC_MALE_2018_2019_diff',
       'TOT_MALE_ratio_2018_2019_diff', 'TOT_FEMALE_ratio_2018_2019_diff',
       'WAC_MALE_ratio_2018_2019_diff', 'HAA_MALE_ratio_2018_2019_diff',
       'HAAC_MALE_ratio_2018_2019_diff', 'NHIA_FEMALE_2017_2018_pct_change',
       'NHIAC_FEMALE_2017_2018_pct_change', 'H_FEMALE_2017_2018_p

## Develop baseline with dummy regressor
* run dummy regressor with strategy mean and median


In [8]:
# with mean
dummy = DummyRegressor(strategy='mean')

dummy.fit(X_train_k_best, y_train)

dummy.score(X_test[best_k_cols], y_test)


-0.019724974058809464

In [9]:
# with median
dummy = DummyRegressor(strategy='median')

dummy.fit(X_train_k_best, y_train)

dummy.score(X_test[best_k_cols], y_test)

-0.0028149158026582466

In [10]:
param_grid = {'alpha':np.arange(0.1, 1.1, step=0.00000001),
              'selection':['random', 'cyclic'],
              'positive':[True, False],
              'fit_intercept':[True,False], 'random_state':[42],
              'normalize':[True,False], 'warm_start':[True,False]}
lasso = Lasso()
lasso_grid = RandomizedSearchCV(lasso, param_grid, cv=5, random_state=42)
lasso_grid.fit(X_train_k_best, y_train)
print('best score: ', lasso_grid.best_score_)
print('best estimator', lasso_grid.best_estimator_)
tmp = lasso_grid.best_score_

best score:  0.3202235253240092
best estimator Lasso(alpha=0.3462019998704101, random_state=42, warm_start=True)


In [11]:
lasso_best = lasso_grid.best_estimator_
lasso_best.fit(X_train_k_best, y_train)

# this retrieves correlation of determination value
print('R^2', lasso_best.score(X_test[best_k_cols], y_test))

R^2 0.32940368979523404


### Evaluation

From these parameters, we get a test and training score that is relatively same of 0.32.
This means that our model is weak in generalizing our target.

In [None]:
rf_param_grid = {'n_estimators':np.arange(10,1000,step=100), 'criterion':['mse','mae'], 'max_features':['auot','sqrt','log2'],
                 'random_state':[42]}
rf = RandomForestRegressor()
rf_grid = RandomizedSearchCV(rf, rf_param_grid, cv=5, random_state=42)
rf_grid.fit(X_train_k_best, y_train)

print('best score', rf_grid.best_score_, 'best params', rf_grid.best_estimator_)


In [None]:
rf_best = rf_grid.best_estimator_
rf_best.fit(X_train_k_best, y_train)
rf_best.score(X_test[best_k_cols], y_test)

* Makes sense that Random Forest Regessor does a littlbe bit better than a individual regressor


* It appears that this is a more accurate score as train and test score are about the same
* 70-30 train_test_split ratio along with 100 columns appears to be suitable for model to be trained on
