# Preprocess the data
* follows same methods as Preprocessing 1; however adds 2018 individual variables to X
* Saves new X as X_alt


### Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import klib
import plotly.figure_factory as ff
from IPython.core.display import display
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing

# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
#np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [2]:
# define useful function to create scatterplots of target feature against desired columns
def scatterplots(df, target, columns, ncol=None, figsize=(20, 25)):
    if ncol is None:
        ncol = len(columns)
    nrow = int(np.ceil(len(columns) / ncol))
    fig, axes = plt.subplots(nrow, ncol, figsize=figsize, squeeze=False)
    fig.subplots_adjust(wspace=0.7, hspace=0.7)
    for i, col in enumerate(columns):
        # ax = axes.flatten()[i]
        # ax.scatter(x = col, y = target, data=df, alpha=0.5)
        sns.regplot(ax=axes.flatten()[i], x=col, y=target, data=df, scatter_kws={'alpha':0.5})
        # ax.set(xlabel=col, ylabel=target)
    nsubplots = nrow * ncol
    for empty in range(i+1, nsubplots):
        axes.flatten()[empty].set_visible(False)

### Load data

In [3]:
pm10_2017_2019 = r'../../data/processed/pm10_2017_2019'
pm10_year_diff = r'../../data/processed/pm10_year_diff'
pm10_year_pct_change = r'../../data/processed/pm10_year_pct_change'
pm10_year_diff_top_corr =  r'../../data/processed/pm10_year_diff_top_corr'
pm10_year_pct_change_top_corr = r'../../data/processed/pm10_year_pct_change_top_corr'

df_norm = pd.read_csv(pm10_2017_2019)
df_year_diff = pd.read_csv(pm10_year_diff)
df_year_diff_top = pd.read_csv(pm10_year_diff_top_corr)
df_year_pct_change = pd.read_csv(pm10_year_pct_change)
df_year_pct_change_top = pd.read_csv(pm10_year_pct_change_top_corr)

In [4]:
# concatenate all features
# after concentate look at features and see if need to standardize or normalize

display(df_year_pct_change)
display(df_year_diff)

Unnamed: 0,AQI_2019,AQI_2017_2018_pct_change,Civilian_labor_force_2017_2018_pct_change,Employed_2017_2018_pct_change,Unemployed_2017_2018_pct_change,Unemployment_rate_2017_2018_pct_change,"Poverty Estimate, All Ages_2017_2018_pct_change",90% CI LB All Ages_2017_2018_pct_change,90% CI UB All Ages_2017_2018_pct_change,"Poverty Percent, All Ages_2017_2018_pct_change",...,HWAC_MALE_ratio_2018_2019_pct_change,HWAC_FEMALE_ratio_2018_2019_pct_change,HBAC_MALE_ratio_2018_2019_pct_change,HBAC_FEMALE_ratio_2018_2019_pct_change,HIAC_MALE_ratio_2018_2019_pct_change,HIAC_FEMALE_ratio_2018_2019_pct_change,HAAC_MALE_ratio_2018_2019_pct_change,HAAC_FEMALE_ratio_2018_2019_pct_change,HNAC_MALE_ratio_2018_2019_pct_change,HNAC_FEMALE_ratio_2018_2019_pct_change
0,20.073389,0.061135,0.013844,0.020189,-0.128942,-0.139535,-0.038496,-0.045462,-0.032514,-0.035928,...,0.026476,0.031723,0.032864,0.040322,0.054139,0.074790,0.122425,-0.041309,0.055424,0.023801
1,12.958515,0.059050,0.028336,0.033239,-0.093125,-0.128205,-0.051402,-0.053703,-0.049561,-0.070313,...,0.022018,0.030026,0.037618,0.016357,-0.005556,0.051458,-0.087266,0.243875,0.072299,-0.088200
2,13.125000,0.021854,-0.001903,0.002235,-0.093261,-0.093023,-0.059333,-0.062900,-0.056488,-0.056338,...,0.025343,0.032246,-0.001476,0.063161,0.004428,-0.048117,-0.093386,0.020489,-0.010560,0.033049
3,11.247911,-0.074147,0.006385,0.011546,-0.038812,-0.039216,0.128447,0.155791,0.105858,0.126888,...,0.042465,0.013679,0.095179,-0.115560,0.036155,-0.021558,-0.158081,0.130874,0.496745,0.122559
4,17.413408,-0.138879,0.003142,0.003413,-0.001463,0.000000,-0.049222,-0.038434,-0.056579,-0.062112,...,0.008941,0.001484,-0.001274,-0.048402,0.019675,-0.012253,0.074008,-0.012205,0.018867,-0.083067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,0.168176,-0.009622,-0.006847,-0.102491,-0.103448,0.040280,0.012364,0.060632,0.034884,...,0.008593,0.038840,0.049928,0.096699,0.025571,-0.000068,0.352849,0.047547,0.142779,-0.166724
204,14.431990,0.132224,-0.014080,-0.009978,-0.111942,-0.100000,-0.003911,-0.005839,-0.002167,0.000000,...,0.015727,0.015638,0.013946,0.019648,0.007658,0.006921,-0.001563,0.007946,0.016960,0.001314
205,15.656425,0.259583,-0.002127,0.000323,-0.084676,-0.103448,0.044369,0.040462,0.047385,0.041667,...,0.020747,0.021930,0.045184,0.029480,0.100848,0.057137,0.117484,0.185540,0.058094,0.251201
206,13.354142,0.088415,-0.000829,-0.001614,0.019909,0.027778,-0.048956,-0.071456,-0.032807,-0.061856,...,0.006963,0.020251,-0.020994,0.047957,0.007984,-0.018559,-0.020887,0.018051,0.196858,0.397275


Unnamed: 0,AQI_2019,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",...,HWAC_MALE_ratio_2018_2019_diff,HWAC_FEMALE_ratio_2018_2019_diff,HBAC_MALE_ratio_2018_2019_diff,HBAC_FEMALE_ratio_2018_2019_diff,HIAC_MALE_ratio_2018_2019_diff,HIAC_FEMALE_ratio_2018_2019_diff,HAAC_MALE_ratio_2018_2019_diff,HAAC_FEMALE_ratio_2018_2019_diff,HNAC_MALE_ratio_2018_2019_diff,HNAC_FEMALE_ratio_2018_2019_diff
0,20.073389,1.008865,4312,6021,-1709,-0.6,-4148.0,-4526.0,-3770.0,-0.6,...,0.049821,0.052091,0.005691,0.007374,0.003112,0.003709,0.002302,-0.000589,0.000933,0.000321
1,12.958515,0.689292,5003,5641,-638,-0.5,-2312.0,-2147.0,-2477.0,-0.9,...,0.050694,0.063381,0.009625,0.004769,-0.000580,0.004921,-0.002948,0.009569,0.001320,-0.001634
2,13.125000,0.292164,-202,227,-429,-0.4,-2748.0,-2585.0,-2911.0,-1.2,...,0.038534,0.043819,-0.000419,0.018125,0.000450,-0.005290,-0.004412,0.000886,-0.000438,0.001328
3,11.247911,-1.114232,130,211,-81,-0.4,3009.0,3302.0,2716.0,4.2,...,0.100582,0.027614,0.005440,-0.007088,0.036745,-0.023202,-0.007052,0.005474,0.004155,0.001367
4,17.413408,-2.926909,155,159,-4,0.0,-946.0,-599.0,-1293.0,-1.0,...,0.148808,0.025037,-0.000588,-0.022055,0.013852,-0.008442,0.015924,-0.003012,0.002000,-0.008345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,1.567276,-466,-322,-144,-0.3,282.0,73.0,491.0,0.3,...,0.022589,0.079551,0.004548,0.006826,0.002271,-0.000005,0.006829,0.001137,0.001138,-0.002278
204,14.431990,1.676852,-6690,-4550,-2140,-0.4,-696.0,-987.0,-405.0,0.0,...,0.110881,0.105975,0.008989,0.013032,0.002425,0.002219,-0.000115,0.000633,0.000344,0.000025
205,15.656425,3.299519,-481,71,-552,-0.3,847.0,673.0,1021.0,0.2,...,0.047631,0.049311,0.003626,0.002820,0.007842,0.004613,0.003824,0.005808,0.000693,0.002434
206,13.354142,0.921309,-40,-75,35,0.1,-455.0,-555.0,-355.0,-0.6,...,0.049293,0.133734,-0.005776,0.012418,0.003351,-0.007659,-0.002894,0.002228,0.008761,0.012859


In [5]:
df = pd.merge(df_year_diff, df_year_pct_change)
display(df)


Unnamed: 0,AQI_2019,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",...,HWAC_MALE_ratio_2018_2019_pct_change,HWAC_FEMALE_ratio_2018_2019_pct_change,HBAC_MALE_ratio_2018_2019_pct_change,HBAC_FEMALE_ratio_2018_2019_pct_change,HIAC_MALE_ratio_2018_2019_pct_change,HIAC_FEMALE_ratio_2018_2019_pct_change,HAAC_MALE_ratio_2018_2019_pct_change,HAAC_FEMALE_ratio_2018_2019_pct_change,HNAC_MALE_ratio_2018_2019_pct_change,HNAC_FEMALE_ratio_2018_2019_pct_change
0,20.073389,1.008865,4312,6021,-1709,-0.6,-4148.0,-4526.0,-3770.0,-0.6,...,0.026476,0.031723,0.032864,0.040322,0.054139,0.074790,0.122425,-0.041309,0.055424,0.023801
1,12.958515,0.689292,5003,5641,-638,-0.5,-2312.0,-2147.0,-2477.0,-0.9,...,0.022018,0.030026,0.037618,0.016357,-0.005556,0.051458,-0.087266,0.243875,0.072299,-0.088200
2,13.125000,0.292164,-202,227,-429,-0.4,-2748.0,-2585.0,-2911.0,-1.2,...,0.025343,0.032246,-0.001476,0.063161,0.004428,-0.048117,-0.093386,0.020489,-0.010560,0.033049
3,11.247911,-1.114232,130,211,-81,-0.4,3009.0,3302.0,2716.0,4.2,...,0.042465,0.013679,0.095179,-0.115560,0.036155,-0.021558,-0.158081,0.130874,0.496745,0.122559
4,17.413408,-2.926909,155,159,-4,0.0,-946.0,-599.0,-1293.0,-1.0,...,0.008941,0.001484,-0.001274,-0.048402,0.019675,-0.012253,0.074008,-0.012205,0.018867,-0.083067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,1.567276,-466,-322,-144,-0.3,282.0,73.0,491.0,0.3,...,0.008593,0.038840,0.049928,0.096699,0.025571,-0.000068,0.352849,0.047547,0.142779,-0.166724
204,14.431990,1.676852,-6690,-4550,-2140,-0.4,-696.0,-987.0,-405.0,0.0,...,0.015727,0.015638,0.013946,0.019648,0.007658,0.006921,-0.001563,0.007946,0.016960,0.001314
205,15.656425,3.299519,-481,71,-552,-0.3,847.0,673.0,1021.0,0.2,...,0.020747,0.021930,0.045184,0.029480,0.100848,0.057137,0.117484,0.185540,0.058094,0.251201
206,13.354142,0.921309,-40,-75,35,0.1,-455.0,-555.0,-355.0,-0.6,...,0.006963,0.020251,-0.020994,0.047957,0.007984,-0.018559,-0.020887,0.018051,0.196858,0.397275


In [6]:
# Add 2018 columns to dataset

df_norm = df_norm.drop(['GeoFIPS', 'State Name', 'County Name'], axis=1)

col_2018 = df_norm.columns[187:375]

df = pd.merge(df, df_norm[col_2018])


Decide to only have a df retaining features of difference and percent changes
because these column are stationarized random variables and behave similar to independent random variables
whihc will give us more suitable results for modeling

### Perform eda to decide whether we need to standardize or normalize
### Then split data before standardize and normalize


Can tell we need to standardize already by seeing some columns with varying means
especially between Civilian_labor_force_2017_2018_diff with mean 2500 and Unemployment_rate_2017_2018_diff	-.4

In [7]:
# Run Shapiro test on all features and count number of features with a p score less than .05 \
# Features with a shapiro p score less than .05 are considered not gaussian

count = 0
for feature in df.columns:
    shapiro_test = stats.shapiro(df[feature])
    if shapiro_test.pvalue < 0.05:
        count = count + 1
print(" {} features were deemed to not have a normal distribution out of {} features".format(count, len(df)))

 1244 features were deemed to not have a normal distribution out of 208 features


In [8]:
# sns.distplot(x)
shapiro_test = stats.shapiro(df['AQI_2019'])
shapiro_test.pvalue

2.866333004281074e-10

### Result
* Standardization of data should be performed because features have discrepancy in magnitudes from looking at means of some features
* Normalization of data should be performed because majority of the features are reported wht non gaussian distributions

## Split data

In [9]:
X = df.drop('AQI_2019', axis=1)
y = df['AQI_2019']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Scale Data

In [11]:
# checking whether we have negative observations

for cols in df.columns.tolist()[1:]:
     data = df.loc[df[cols] > 0]
data

Unnamed: 0,AQI_2019,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
0,20.073389,1.008865,4312,6021,-1709,-0.6,-4148.0,-4526.0,-3770.0,-0.6,...,1.881780,1.642027,0.173180,0.182885,0.057474,0.049588,0.018804,0.014255,0.016833,0.013497
1,12.958515,0.689292,5003,5641,-638,-0.5,-2312.0,-2147.0,-2477.0,-0.9,...,2.302427,2.110876,0.255856,0.291550,0.104359,0.095639,0.033787,0.039237,0.018256,0.018528
2,13.125000,0.292164,-202,227,-429,-0.4,-2748.0,-2585.0,-2911.0,-1.2,...,1.520503,1.358916,0.283880,0.286971,0.101543,0.109932,0.047240,0.043266,0.041500,0.040176
3,11.247911,-1.114232,130,211,-81,-0.4,3009.0,3302.0,2716.0,4.2,...,2.368571,2.018653,0.057158,0.061340,1.016297,1.076243,0.044611,0.041823,0.008365,0.011153
4,17.413408,-2.926909,155,159,-4,0.0,-946.0,-599.0,-1293.0,-1.0,...,16.642539,16.876696,0.461985,0.455657,0.704053,0.689022,0.215171,0.246814,0.106003,0.100466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,10.503937,1.567276,-466,-322,-144,-0.3,282.0,73.0,491.0,0.3,...,2.628852,2.048205,0.091082,0.070589,0.088805,0.069450,0.019355,0.023909,0.007970,0.013662
204,14.431990,1.676852,-6690,-4550,-2140,-0.4,-696.0,-987.0,-405.0,0.0,...,7.050284,6.776885,0.644583,0.663274,0.316695,0.320602,0.073498,0.079728,0.020275,0.018797
205,15.656425,3.299519,-481,71,-552,-0.3,847.0,673.0,1021.0,0.2,...,2.295775,2.248573,0.080244,0.095647,0.077760,0.080741,0.032545,0.031303,0.011925,0.009689
206,13.354142,0.921309,-40,-75,35,0.1,-455.0,-555.0,-355.0,-0.6,...,7.079351,6.603955,0.275123,0.258939,0.419764,0.412684,0.138573,0.123401,0.044505,0.032367


In [12]:
# because we negative observation we cannot perform logarithmic scale and will instead perform standardization
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 2.70551905,  0.2327869 ,  0.16039692, ...,  0.73126611,
         0.33814272,  0.13515283],
       [-0.53530159, -0.08904047,  0.2148679 , ...,  1.66518729,
         1.19793835,  1.24820206],
       [ 0.77647735, -0.25122218, -0.38406301, ..., -0.65992905,
        -0.66346052, -0.64236676],
       ...,
       [-1.3393787 , -0.37625047, -0.53810594, ..., -0.39892099,
        -0.59093489, -0.57175185],
       [ 1.17720255, -0.26859427, -0.45315121, ..., -0.65292137,
        -0.65261229, -0.69127578],
       [-0.55905886,  0.21034268,  0.10105353, ..., -0.41542781,
        -0.52562127, -0.51581242]])

In [13]:
# scale test data using the scaler fitted from the training set

X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 1.17998294, -0.04909735,  0.08943472, ..., -0.49835965,
        -0.59863848, -0.46814034],
       [-0.51420716, -0.15561232, -0.27711999, ...,  0.02858419,
        -0.02457733, -0.01189596],
       [-0.43934205,  0.09875557,  0.40214312, ..., -0.62340591,
        -0.89072557, -0.85107061],
       ...,
       [ 0.03883044, -0.31677961, -0.43553559, ...,  0.34577355,
         0.65650346,  0.47933231],
       [-0.33949094,  0.96304845,  1.14674643, ..., -0.10691837,
        -0.21937037, -0.11472616],
       [ 0.05234839, -0.10793419, -0.26974892, ..., -0.03763552,
        -0.09835352, -0.08431688]])

In [14]:
# retain X_train_scale as a dataframe
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
display(X_train_scaled)

# retain X_test_scale as a dataframe
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
display(X_test_scaled)

Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
29,2.705519,0.232787,0.160397,0.184132,-0.708151,-0.032929,-0.052244,-0.012659,-0.358545,-0.408037,...,-0.265623,-0.239267,-0.478360,-0.454617,-0.068953,-0.060159,0.727062,0.731266,0.338143,0.135153
19,-0.535302,-0.089040,0.214868,-0.808003,-1.896018,-0.366625,-0.255945,-0.478881,-0.358545,-0.272170,...,1.966411,1.747236,0.166351,0.127846,1.967471,1.962093,1.699092,1.665187,1.197938,1.248202
55,0.776477,-0.251222,-0.384063,0.364611,-0.114218,0.171585,0.192251,0.148936,0.144708,0.203365,...,-0.672893,-0.661165,-0.178878,-0.228808,-0.770089,-0.794749,-0.693480,-0.659929,-0.663461,-0.642367
93,0.021271,-0.363063,-0.509996,0.406645,0.182749,0.164637,0.192520,0.134550,0.144708,0.271299,...,0.612703,0.475619,0.255883,0.181816,0.435487,0.295619,-0.183886,-0.152873,0.157864,0.111195
181,0.105938,0.161016,0.018098,0.375286,0.776682,-0.053591,-0.097600,-0.007554,-0.358545,-0.408037,...,-0.779422,-0.761439,-0.542726,-0.507942,-0.881762,-0.876552,-0.721289,-0.695615,-0.577650,-0.600115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.497668,0.973573,0.912621,0.124417,0.479716,-0.753437,-0.800489,-0.699322,-0.861798,-0.883572,...,-0.422983,-0.395521,0.338929,0.261525,-0.683352,-0.676884,-0.550425,-0.545502,-0.561014,-0.537119
14,0.352561,-0.026907,0.191880,-0.583155,-0.411185,-1.187242,-1.037827,-1.333915,-0.789905,-0.679772,...,0.332722,0.343431,0.038756,0.040298,0.290977,0.296688,2.306325,2.301282,1.658731,1.537832
92,-1.339379,-0.376250,-0.538106,0.447011,0.479716,0.511314,0.493196,0.526518,1.726361,1.562038,...,-0.369061,-0.356314,-0.039043,0.065903,-0.094597,-0.083728,-0.452073,-0.398921,-0.590935,-0.571752
179,1.177203,-0.268594,-0.453151,0.503390,0.776682,0.239330,0.237250,0.239805,0.432282,0.407166,...,-0.801500,-0.772961,-0.597171,-0.555449,0.951338,0.820300,-0.587588,-0.652921,-0.652612,-0.691276


Unnamed: 0,AQI_2017_2018_diff,Civilian_labor_force_2017_2018_diff,Employed_2017_2018_diff,Unemployed_2017_2018_diff,Unemployment_rate_2017_2018_diff,"Poverty Estimate, All Ages_2017_2018_diff",90% CI LB All Ages_2017_2018_diff,90% CI UB All Ages_2017_2018_diff,"Poverty Percent, All Ages_2017_2018_diff",90% CI LB percent_2017_2018_diff,...,HWAC_MALE_ratio_2018,HWAC_FEMALE_ratio_2018,HBAC_MALE_ratio_2018,HBAC_FEMALE_ratio_2018,HIAC_MALE_ratio_2018,HIAC_FEMALE_ratio_2018,HAAC_MALE_ratio_2018,HAAC_FEMALE_ratio_2018,HNAC_MALE_ratio_2018,HNAC_FEMALE_ratio_2018
161,1.179983,-0.049097,0.089435,-0.367981,-1.005118,-0.012268,-0.026032,0.002099,-0.070972,-0.136303,...,-0.322966,-0.350746,-0.325545,-0.323909,0.387984,0.39348,-0.52371,-0.49836,-0.598638,-0.46814
15,-0.514207,-0.155612,-0.27712,0.330583,-0.708151,0.15522,0.167292,0.141604,0.000922,-0.000435,...,-0.321775,-0.304197,-0.540054,-0.544634,0.062488,-0.037553,0.10784,0.028584,-0.024577,-0.011896
73,-0.439342,0.098756,0.402143,-0.814008,-1.005118,-0.896515,-0.687143,-1.10744,-0.789905,-0.611838,...,-0.651537,-0.654134,-0.057482,-0.040492,-0.855003,-0.845808,-0.659433,-0.623406,-0.890726,-0.851071
96,0.772088,-0.380181,-0.551224,0.471697,-0.114218,0.145986,0.148416,0.142439,-0.214758,-0.27217,...,-0.88296,-0.855283,-0.588543,-0.537553,-0.887276,-0.93734,-0.744554,-0.698361,-0.847878,-0.951074
166,-0.232902,-0.437497,-0.518741,0.234172,-1.599052,0.165825,0.159956,0.170748,0.216602,0.135432,...,-0.932549,-0.898605,-0.589867,-0.553735,-1.025117,-1.003201,-0.874986,-0.83773,-0.975171,-0.965515
9,1.005701,0.734548,0.49322,0.61548,1.073649,0.342821,0.230988,0.45644,0.072815,-0.136303,...,0.678221,0.431181,-0.084002,-0.104289,2.280474,1.757984,0.414438,0.330372,0.38562,0.320694
100,-0.677657,-0.381323,-0.557096,0.484374,0.776682,0.045695,0.01673,0.075425,-0.718011,-0.883572,...,-0.932809,-0.897397,-0.664508,-0.63172,-1.015975,-0.996387,-0.825328,-0.820296,-1.0696,-0.937393
135,0.119467,-0.633535,-0.372569,-0.671893,0.182749,-1.402544,-1.288405,-1.511103,-0.358545,-0.27217,...,0.017514,0.116367,3.298175,3.480676,0.444916,0.627653,0.434853,0.596136,0.834306,1.091849
18,2.321711,-0.424943,-0.535482,0.311902,-0.411185,0.283487,0.207012,0.360838,0.647961,0.203365,...,3.640662,3.595297,-0.00726,0.009218,1.475927,1.546005,2.439127,2.590405,1.329744,1.435957
148,-0.946933,-0.373714,-0.513369,0.38763,-1.302085,0.174327,0.198245,0.148287,0.144708,0.271299,...,-0.8999,-0.916755,-0.496788,-0.618729,-0.911233,-1.017279,-0.864162,-0.871,-1.063651,-0.959654


### save X and y and scaled data


In [18]:
X_path = r'../../data/train_test/X_alt'
X.to_csv(X_path, index=False)

X_train_scaled_path = r'../../data/train_test/X_train_scaled_alt'
X_train_scaled.to_csv(X_train_scaled_path, index=False)

X_test_scaled_path = r'../../data/train_test/X_test_scaled_alt'
X_test_scaled.to_csv(X_test_scaled_path, index=False)

y_path = r'../../data/train_test/y_alt'
y.to_csv(y_path, index=False)

y_train_path = r'../../data/train_test/y_train_alt'
y_train.to_csv(y_train_path, index=False)

y_test_path = r'../../data/train_test/y_test_alt'
y_test.to_csv(y_test_path, index=False)

df_path = r'../../data/processed/df_standardized.csv'
df.to_csv(df_path, index=False)

In [16]:
# param_grid = {'alpha':np.arange(0.1,2, step=0.1)}
# lasso = Lasso()
# lasso_grid = GridSearchCV(lasso, param_grid, cv=5)
# lasso_grid.fit(X_train, y_train)
# print('r2 / variance: ', lasso_grid.best_score_, 'best params', lasso_grid.best_params_)
# print('best estimator', lasso_grid.best_estimator_)

In [17]:
lasso = Lasso(alpha=1.0)
lasso.fit(X_train_scaled, y_train)
lasso.score(X_train_scaled, y_train)

0.33139496642885813

## results
When choosing LASSO and without 2018 individual variables, the training score yielded was .71.
However when including the 2018 individual variables, the training score jumped to .78.
With scaled train and test data it appears that with adding 2018 individual columns we go from .32 to .33;
which is only a small improvement for large number of columns added. Which means we will not add 2018 individual columns to
our X.
Note cross validation was not used to output these scores