# Modeling with diffrent train test splits
Replication of Modeling_0_B notebook with changes in train test split.
Done with to see whether train_test splits influence variation in cross fold scores.
And whether influence test and train score; and reduce the difference between the two

### goals
* develop a model that effectively predicts 2019 AQI pollution levels
* fit model with training dataset
* try grid search (b/c we have a small dataset (maybe experiemnt with random grid))


In [282]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import preprocessing, svm
from sklearn.metrics import r2_score, accuracy_score

#Use to ignore convergence warnings
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)


# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
# np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [283]:
# import X and y training and test sets

X = pd.read_csv('../../data/train_test/X_alt')

X_train = pd.read_csv('../../data/train_test/X_train_scaled_alt')

X_test = pd.read_csv('../../data/train_test/X_test_scaled_alt')

y = pd.read_csv('../../data/train_test/y_alt')

y_train = pd.read_csv('../../data/train_test/y_train_alt')

y_test = pd.read_csv('../../data/train_test/y_test_alt')


# Form a baseline for comparison


In [284]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.26, random_state=42)
#

In [285]:
# # because we negative observation we cannot perform logarithmic scale and will instead perform standardization
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

In [286]:
# scale test data using the scaler fitted from the training set

X_test_scaled = scaler.transform(X_test)

In [287]:
# Changed scaled X's from numpy array to dataframe and assign scaled data to X_train and X_test

# retain X_train_scaled as a dataframe
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
display(X_train)

# retain X_test_scale as a dataframe
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
display(X_test)


Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
93,0.041247,-0.360395,-0.505481,0.402096,0.165609,0.161656,0.189973,0.131138,0.139731,0.265833,...,0.622268,0.482024,0.250466,0.176620,0.445016,0.304344,-0.173081,-0.141878,0.163032,0.115658
181,0.127410,0.158828,0.017878,0.370958,0.761803,-0.054578,-0.097440,-0.009706,-0.359108,-0.407552,...,-0.776256,-0.757272,-0.541163,-0.507081,-0.878638,-0.873588,-0.712099,-0.685977,-0.570077,-0.593322
158,-0.133746,-0.499969,-0.564168,0.191089,-1.622971,0.562055,0.606780,0.511722,1.707510,1.814617,...,-0.919701,-0.883861,-0.568475,-0.534282,-1.009983,-1.010890,-0.809347,-0.846821,-1.038934,-1.094741
5,-0.046201,9.203349,8.767154,0.811192,1.059899,-3.018258,-2.912993,-3.106537,-0.715422,-0.676905,...,0.657761,0.657361,-0.030127,-0.040108,0.574070,0.591544,0.540617,0.566915,0.445491,0.439417
132,0.162776,-1.017811,-0.741469,-0.699978,-0.132487,-1.639869,-1.450845,-1.824677,-0.287845,-0.272875,...,1.059462,1.291909,9.594300,9.899510,2.648222,3.177675,1.037281,1.114281,4.063447,4.716380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.486861,0.963858,0.904383,0.121857,0.463706,-0.748030,-0.793773,-0.695346,-0.857947,-0.878921,...,-0.418179,-0.390692,0.332787,0.255629,-0.679264,-0.672939,-0.540722,-0.535488,-0.553495,-0.530532
14,0.378389,-0.027353,0.190103,-0.580727,-0.430584,-1.177870,-1.028897,-1.324316,-0.786684,-0.676905,...,0.341001,0.349597,0.035237,0.036345,0.299803,0.305418,2.324619,2.318418,1.658992,1.537621
92,-1.343441,-0.373461,-0.533339,0.442178,0.463706,0.505166,0.487844,0.519633,1.707510,1.545263,...,-0.364008,-0.351413,-0.041882,0.061725,-0.087646,-0.076868,-0.442074,-0.388541,-0.583318,-0.565051
179,1.217600,-0.266802,-0.449146,0.498159,0.761803,0.235666,0.234286,0.235461,0.424782,0.400510,...,-0.798435,-0.768814,-0.595132,-0.554171,0.963375,0.831604,-0.577996,-0.643177,-0.644794,-0.684184


Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
161,1.220429,-0.049338,0.088576,-0.36707,-1.026778,-0.013632,-0.02654,-0.000139,-0.074057,-0.138198,...,-0.317701,-0.345835,-0.325881,-0.324664,0.397281,0.402686,-0.513927,-0.488228,-0.590996,-0.461779
15,-0.503692,-0.154867,-0.274693,0.326571,-0.728681,0.152325,0.164981,0.13813,-0.002795,-0.003521,...,-0.316505,-0.299202,-0.538514,-0.543451,0.070203,-0.030465,0.119523,0.040034,-0.018812,-0.00703
73,-0.427504,0.097145,0.398481,-0.809953,-1.026778,-0.8898,-0.681484,-1.099848,-0.786684,-0.609567,...,-0.647783,-0.649773,-0.06016,-0.043736,-0.851749,-0.842694,-0.650057,-0.613588,-0.882128,-0.843455
96,0.805328,-0.377355,-0.546339,0.46669,-0.132487,0.143176,0.146281,0.138958,-0.216583,-0.272875,...,-0.88027,-0.851285,-0.586579,-0.536432,-0.884179,-0.934675,-0.735434,-0.68873,-0.839421,-0.943131
166,-0.217417,-0.43414,-0.514148,0.230839,-1.622971,0.162834,0.157713,0.167016,0.210993,0.131156,...,-0.930086,-0.894685,-0.587892,-0.552472,-1.02269,-1.00086,-0.866258,-0.828448,-0.966297,-0.957525
9,1.043069,0.727048,0.488741,0.60946,1.059899,0.338212,0.228082,0.450177,0.068468,-0.138198,...,0.688088,0.437506,-0.086448,-0.106973,2.298973,1.773898,0.427043,0.342577,0.390044,0.324471
100,-0.670029,-0.378486,-0.552158,0.479278,0.761803,0.043801,0.015823,0.072537,-0.715422,-0.878921,...,-0.930348,-0.893475,-0.66188,-0.629773,-1.013503,-0.994013,-0.816451,-0.81097,-1.060418,-0.929494
135,0.141177,-0.628362,-0.369286,-0.66884,0.165609,-1.391205,-1.277138,-1.499935,-0.359108,-0.272875,...,0.024344,0.122122,3.266168,3.446515,0.45449,0.638011,0.44752,0.609006,0.837263,1.093099
18,2.382328,-0.421702,-0.530739,0.308021,-0.430584,0.27942,0.204331,0.355422,0.63857,0.198494,...,3.664147,3.607343,-0.010377,0.005538,1.490515,1.560876,2.45782,2.608265,1.331081,1.436079
148,-0.944063,-0.370948,-0.508824,0.383215,-1.324874,0.171258,0.195645,0.144753,0.139731,0.265833,...,-0.897288,-0.912868,-0.495627,-0.616895,-0.908252,-1.015007,-0.855402,-0.861802,-1.054488,-0.951682


In [288]:
# used to test feature selection
select_k_best = SelectKBest(f_regression, k=100)
X_train_k_best = select_k_best.fit_transform(X_train, y_train)

best_k_cols = X_train.columns[select_k_best.get_support()]
best_k_cols

Index(['AQI_2017_2018_diff', 'TOT_MALE_ratio_2017_2018_diff',
       'TOT_FEMALE_ratio_2017_2018_diff', 'HBA_MALE_ratio_2017_2018_diff',
       'HBA_FEMALE_ratio_2017_2018_diff', 'HBAC_MALE_ratio_2017_2018_diff',
       'HBAC_FEMALE_ratio_2017_2018_diff', 'HAA_MALE_2017_2019_diff',
       'HAAC_MALE_2017_2019_diff', 'TOT_MALE_ratio_2017_2019_diff',
       'TOT_FEMALE_ratio_2017_2019_diff', 'WA_MALE_ratio_2017_2019_diff',
       'WAC_MALE_ratio_2017_2019_diff', 'HAA_MALE_ratio_2017_2019_diff',
       'HBAC_FEMALE_ratio_2017_2019_diff', 'HAAC_MALE_ratio_2017_2019_diff',
       '90% CI LB 5-17 fam_2018_2019_diff', 'HIA_MALE_2018_2019_diff',
       'HIA_FEMALE_2018_2019_diff', 'HAA_MALE_2018_2019_diff',
       'HAA_FEMALE_2018_2019_diff', 'HIAC_MALE_2018_2019_diff',
       'HIAC_FEMALE_2018_2019_diff', 'HAAC_MALE_2018_2019_diff',
       'WAC_MALE_ratio_2018_2019_diff', 'HAA_MALE_ratio_2018_2019_diff',
       'HAAC_MALE_ratio_2018_2019_diff', 'NHIA_FEMALE_2017_2018_pct_change',
       'NHIA

## Develop baseline with dummy regressor
* run dummy regressor with strategy mean and median


In [289]:
# with mean
dummy = DummyRegressor(strategy='mean')

dummy.fit(X_train_k_best, y_train)

dummy.score(X_test[best_k_cols], y_test)


-0.014683484428373195

In [290]:
# with median
dummy = DummyRegressor(strategy='median')

dummy.fit(X_train_k_best, y_train)

dummy.score(X_test[best_k_cols], y_test)

-0.005731197263146637

In [291]:
param_grid = {'alpha':np.arange(0.1, 1.1, step=0.00000001),
              'selection':['random', 'cyclic'],
              'positive':[True, False],
              'fit_intercept':[True,False], 'random_state':[42],
              'normalize':[True,False], 'warm_start':[True,False]}
lasso = Lasso()
lasso_grid = RandomizedSearchCV(lasso, param_grid, cv=5, random_state=42)
lasso_grid.fit(X_train_k_best, y_train)
print('best score: ', lasso_grid.best_score_)
print('best estimator', lasso_grid.best_estimator_)
tmp = lasso_grid.best_score_

best score:  0.32284550703007386
best estimator Lasso(alpha=0.3462019998704101, random_state=42, warm_start=True)


In [292]:
lasso_best = lasso_grid.best_estimator_
lasso_best.fit(X_train_k_best, y_train)

# this retrieves correlation of determination value
print('R^2', lasso_best.score(X_test[best_k_cols], y_test))

R^2 0.43426603984571566


In [293]:
train_scores = []
for i in range(0,5):
    train_scores.append(lasso_grid.cv_results_['split{}_test_score'.format(i)][lasso_grid.best_index_])

print(np.std(train_scores), np.mean(train_scores), np.var(train_scores))
print('train scores', train_scores)

0.11292810111260876 0.32284550703007386 0.012752756020899586
train scores [0.5244933777872176, 0.24356422010782053, 0.36148184707155695, 0.2079735901386297, 0.27671450004514486]


### Evaluation

Variation among the cross fold train scores seem reasonable and model does not appear to overfit

In [294]:
rf_param_grid = {'n_estimators':np.arange(10,1000,step=1), 'criterion':['mse','mae'], 'max_features':['auot','sqrt','log2'],
                 'random_state':[42]}
rf = RandomForestRegressor()
rf_grid = RandomizedSearchCV(rf, rf_param_grid, cv=5, random_state=42)
rf_grid.fit(X_train_k_best, y_train)

print('best score', rf_grid.best_score_, 'best params', rf_grid.best_estimator_)


 0.37608502        nan 0.37728992 0.38191013]


best score 0.3819101309350703 best params RandomForestRegressor(criterion='mae', max_features='sqrt', n_estimators=476,
                      random_state=42)


In [295]:
rf_best = rf_grid.best_estimator_
rf_best.fit(X_train_k_best, y_train)
rf_best.score(X_test[best_k_cols], y_test)

0.415202244413048

* Makes sense that Random Forest Regessor does a littlbe bit better than a individual regressor



### save new X and y for 74-26 split


In [297]:
X_train_path = r'../../data/train_test/X_train_74_26'
X_train.to_csv(X_train_path, index=False)

X_test_scaled_path = r'../../data/train_test/X_test_74_26'
X_test.to_csv(X_test_scaled_path, index=False)

y_train_path = r'../../data/train_test/y_train_alt_74_26'
y_train.to_csv(y_train_path, index=False)

y_test_path = r'../../data/train_test/y_test_alt+74_26'
y_test.to_csv(y_test_path, index=False)

# Overall notes
* it appears that running with 76-24 split, produced results with acceptiable variation between split results from cross validation
and test result
