## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful tp scale data only based on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2:Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the amount of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Hult-Winters).

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# initialize the plotting sizes
# set size
plt.rc('figure', figsize=(10, 5))
# subplots size
fizsize_with_subplots = (10, 10)
# histogram size
bin_size = 10

In [2]:
# Loading data
dates = pd.read_csv('data/dates.csv',header=None)
queries = pd.read_csv('data/queries.csv',header=None)
X = pd.read_csv('data/X.csv',header=None)
y = pd.read_csv('data/y.csv',header=None)

X.columns = queries

In [3]:
# data preprocessing and train-test split
# this data is well formed with no missing value and other symbols or labels that are non numerical.
# splitting the data into train and test

# Here we want the first 500 queries
X = X.iloc[:, 0: 250]

# first with the last year as test
# for convinience, pick the two validation set starting from 1/3 and 2/3 of the training set,
# 10% of training set as validation, here use 400 data points, two 200 periods.
test_size =365
length = X.shape[0]
l =(X.shape[0]-test_size)//3

train1_X = pd.concat([X[0:l],X[l+200:2*l],X[2*l+200:length-test_size]])
val1_X = pd.concat([X[l:l+200],X[2*l:2*l+200]])
train1_y = pd.concat([y[0:l],y[l+200:2*l],y[2*l+200:length-test_size]])
val1_y = pd.concat([y[l:l+200],y[2*l:2*l+200]])
test1_X = X[length-test_size:]
test1_y = y[length-test_size:]

# second with the last 2 year as test
test_size2 =365*2
l2 =(X.shape[0]-test_size2)//3

train2_X = pd.concat([X[0:l2],X[l2+180:2*l2],X[2*l2+180:length-test_size2]])
val2_X = pd.concat([X[l2:l2+180],X[2*l2:2*l2+180]])
train2_y = pd.concat([y[0:l2],y[l2+180:2*l2],y[2*l2+180:length-test_size2]])
val2_y = pd.concat([y[l2:l2+180],y[2*l2:2*l2+180]])
test2_X = X[length-test_size2:]
test2_y = y[length-test_size2:]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('validation1 X:',val1_X.shape,' validation1 y:',val1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)
print('validation2 X:',val2_X.shape,' validation2 y:',val2_y.shape)

X shape:  (4383, 250)   y shape:  (4383, 1)
train1 X: (3618, 250)  train1 y: (3618, 1)  Test1 X shape: (365, 250)  Test1 y shape: (365, 1)
validation1 X: (400, 250)  validation1 y: (400, 1)
train2 X: (3293, 250)  train2 y: (3293, 1)  Test2 X: (730, 250)  Test2 y: (730, 1)
validation2 X: (360, 250)  validation2 y: (360, 1)


In [4]:
train_size = train1_X.shape[0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs[i] = 0
    else:
        corrs[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [5]:
def corr_filter(df_X,df_Xval,df_Xtest,corrs,threshold):
    X = df_X.copy()
    X_test = df_Xtest.copy()
    X_val = df_Xval.copy()
    col_corr = set() # Set of all the names of deleted columns
    for i in range(0,250):
            if corrs[i,0] < threshold: 
                #print(X.columns[i])
                colname = df_X.columns[i]
                del X[colname] # deleting the column from the dataset
                del X_test[colname] 
                del X_val[colname]

    return X,X_val,X_test

## Fixing pearson correlation filter r>=0.2

In [6]:
#train1_X0,test1_X0 = corr_filter(train1_X,test1_X,corrs,0.1)
train1_X1,val1_X1,test1_X1 = corr_filter(train1_X,val1_X,test1_X,corrs,0.2)
#train1_X2,test1_X2 = corr_filter(train1_X,test1_X,corrs,0.3)
#train1_X3,test1_X3 = corr_filter(train1_X,test1_X,corrs,0.4)

In [7]:
#print('For correlation filter r>=0.1, we select feature number: ',train1_X0.shape[1])
print('For correlation filter r>=0.2, we select feature number: ',train1_X1.shape[1])
#print('For correlation filter r>=0.3, we select feature number: ',train1_X2.shape[1])
#print('For correlation filter r>=0.4, we select feature number: ',train1_X3.shape[1])

For correlation filter r>=0.2, we select feature number:  157


In [8]:
# Defien the mearure matrics, MAE, RMSE, CORR
# define three metrics: mean absolute error, root mean squared error and Pearson's correlation.
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.correcoef returns Pearson product-moment correlation coefficients
def pearson_r(x,y):   
    corr_mat = np.corrcoef(x,y)
    return corr_mat[0,1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convinient tuning
def eNet(a,l,train_X,train_y,test_X,test_y):
    alpha = a
    l1_ratio = l
    enet = ElasticNet(alpha=a, l1_ratio=l, normalize=False,max_iter=10000)
    enet.fit(train_X,train_y)
    print('Nonzero weights: %d from %d' % (len(np.nonzero(enet.coef_)[0]), len(enet.coef_)))
    y_pred1 = enet.predict(test_X)

    mae1 = mean_absolute_error(test_y, y_pred1)
    #print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(test_y, y_pred1))
    #print('The root mean squared error is: ',rmse1)
    
    corr_y = test_y.copy()
    corr_y['y_act'] = test_y
    corr_y['y_pred']= y_pred1
    corr = np.corrcoef(corr_y['y_act'],corr_y['y_pred'])[0,1]
    #print('The correlation is: ',corr)
    
    return rmse1, mae1,corr
    
    


# scaling and modeling
scaler = MinMaxScaler()
train1_X0_scaled = scaler.fit_transform(train1_X0)
test1_X0_scaled = scaler.transform(test1_x0)

train1_X1_scaled = scaler.fit_transform(train1_X1)
test1_X1_scaled = scaler.transform(test1_x1)

train1_X2_scaled = scaler.fit_transform(train1_X2)
test1_X2_scaled = scaler.transform(test1_x2)

train1_X3_scaled = scaler.fit_transform(train1_X3)
test1_X3_scaled = scaler.transform(test1_x3)


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.1, 310 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

In [11]:
## Fixing l1-ratio = 0.3, only tuning alpha.
para = range(1,400)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.2, 212 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train1_X1,train1_y,val1_X1,val1_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')
        

For correlation filter r>=0.2, 212 features are selected
alpha: 1  L1-ratio: 0.3
Nonzero weights: 136 from 157
Best RMSE is updated! 
The mean absolute error is:  22.999130057518933
The root mean squared error is:  46.118053398873684
The correlation is:  0.7760919674903088
----------------------------------------
alpha: 2  L1-ratio: 0.3
Nonzero weights: 120 from 157
Best RMSE is updated! 
The mean absolute error is:  22.53179426896336
The root mean squared error is:  44.76688615909705
The correlation is:  0.7950867960507111
----------------------------------------
alpha: 3  L1-ratio: 0.3
Nonzero weights: 110 from 157
Best RMSE is updated! 
The mean absolute error is:  21.890202271058264
The root mean squared error is:  43.13786725712112
The correlation is:  0.8115744866082267
----------------------------------------
alpha: 4  L1-ratio: 0.3
Nonzero weights: 100 from 157
Best RMSE is updated! 
The mean absolute error is:  21.406409317158094
The root mean squared error is:  41.81016314820

Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  18.418757563860332
The root mean squared error is:  33.507469809247986
The correlation is:  0.9186345876774353
----------------------------------------
alpha: 54  L1-ratio: 0.3
Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  18.390699841476202
The root mean squared error is:  33.44224610471042
The correlation is:  0.9190952532444169
----------------------------------------
alpha: 55  L1-ratio: 0.3
Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  18.362511568241455
The root mean squared error is:  33.37735616315034
The correlation is:  0.919547887661917
----------------------------------------
alpha: 56  L1-ratio: 0.3
Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  18.335484468002154
The root mean squared error is:  33.31535061489667
The correlation is:  0.9199823900692474
--------------------------------------

Nonzero weights: 10 from 157
Best RMSE is updated! 
The mean absolute error is:  15.995383721388352
The root mean squared error is:  28.346941460504254
The correlation is:  0.9288970464299726
----------------------------------------
alpha: 88  L1-ratio: 0.3
Nonzero weights: 10 from 157
Best RMSE is updated! 
The mean absolute error is:  15.938536785356828
The root mean squared error is:  28.222788240263103
The correlation is:  0.9290177764146377
----------------------------------------
alpha: 89  L1-ratio: 0.3
Nonzero weights: 10 from 157
Best RMSE is updated! 
The mean absolute error is:  15.882405308744557
The root mean squared error is:  28.100206515847237
The correlation is:  0.9291331097952784
----------------------------------------
alpha: 90  L1-ratio: 0.3
Nonzero weights: 9 from 157
Best RMSE is updated! 
The mean absolute error is:  15.8277187231316
The root mean squared error is:  27.98039835117985
The correlation is:  0.9292374278773471
--------------------------------------

Nonzero weights: 9 from 157
Best RMSE is updated! 
The mean absolute error is:  14.308492414318357
The root mean squared error is:  24.71529686161643
The correlation is:  0.9308125389654994
----------------------------------------
alpha: 120  L1-ratio: 0.3
Nonzero weights: 9 from 157
Best RMSE is updated! 
The mean absolute error is:  14.253694004081858
The root mean squared error is:  24.60205756806264
The correlation is:  0.9308426358714439
----------------------------------------
alpha: 121  L1-ratio: 0.3
Nonzero weights: 9 from 157
Best RMSE is updated! 
The mean absolute error is:  14.201428198245681
The root mean squared error is:  24.4935655121127
The correlation is:  0.9308614741571984
----------------------------------------
alpha: 122  L1-ratio: 0.3
Nonzero weights: 9 from 157
Best RMSE is updated! 
The mean absolute error is:  14.146275859611821
The root mean squared error is:  24.380388431395826
The correlation is:  0.9308870043822925
---------------------------------------

Nonzero weights: 7 from 157
Best RMSE is updated! 
The mean absolute error is:  12.84818704700862
The root mean squared error is:  21.804094753846826
The correlation is:  0.9305579761881636
----------------------------------------
alpha: 151  L1-ratio: 0.3
Nonzero weights: 7 from 157
Best RMSE is updated! 
The mean absolute error is:  12.812105967176858
The root mean squared error is:  21.74055202186074
The correlation is:  0.930601924468667
----------------------------------------
alpha: 152  L1-ratio: 0.3
Nonzero weights: 7 from 157
Best RMSE is updated! 
The mean absolute error is:  12.77367548778896
The root mean squared error is:  21.672918080292117
The correlation is:  0.9306480493009518
----------------------------------------
alpha: 153  L1-ratio: 0.3
Nonzero weights: 7 from 157
Best RMSE is updated! 
The mean absolute error is:  12.738041978126962
The root mean squared error is:  21.609940756029506
The correlation is:  0.9306870142901658
---------------------------------------

Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.861541961198833
The root mean squared error is:  20.13027210958394
The correlation is:  0.9309553050672208
----------------------------------------
alpha: 181  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.831923189652588
The root mean squared error is:  20.079704057164275
The correlation is:  0.9309373188385057
----------------------------------------
alpha: 182  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.803283731886868
The root mean squared error is:  20.02939427432183
The correlation is:  0.9309172842527209
----------------------------------------
alpha: 183  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.776601640239733
The root mean squared error is:  19.979293963017973
The correlation is:  0.930895184087817
--------------------------------------

Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.07861843256925
The root mean squared error is:  18.4307687081021
The correlation is:  0.9288559058743205
----------------------------------------
alpha: 218  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.061850630921505
The root mean squared error is:  18.39003838320743
The correlation is:  0.9287563930160219
----------------------------------------
alpha: 219  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.045258307340884
The root mean squared error is:  18.34959411853722
The correlation is:  0.92865452781619
----------------------------------------
alpha: 220  L1-ratio: 0.3
Nonzero weights: 6 from 157
Best RMSE is updated! 
The mean absolute error is:  11.028685886483638
The root mean squared error is:  18.30943623291406
The correlation is:  0.9285503011491181
----------------------------------------
al

alpha: 248  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 249  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 250  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 251  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 252  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 253  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 254  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 255  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 256  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 257  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 258  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 259  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 260  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 261  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 262  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 263  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 264  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 265  L1-ratio: 0.3
Nonzero weights: 4 from 157
alpha: 266  L1-ratio: 0.3
No

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 150 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 103 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

In [13]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [243]
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train1_X1,train1_y,test1_X1,test1_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

alpha: 243  L1-ratio: 0.3
Nonzero weights: 4 from 157
Best RMSE is updated! 
The mean absolute error is:  3.057406461316006
The root mean squared error is:  3.4791986971928814
The correlation is:  0.8702428004449374
----------------------------------------


## For the last 2 years as the testing set

In [15]:
train_size = 3653
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs2[i] = 0
    else:
        corrs2[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs2.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [16]:
#train2_X0,test2_X0 = corr_filter(train2_X,test2_X,corrs2,0.1)
train2_X1,val2_X1,test2_X1 = corr_filter(train2_X,val2_X,test2_X,corrs2,0.2)
#train2_X2,test2_X2 = corr_filter(train2_X,test2_X,corrs2,0.3)
#train2_X3,test2_X3 = corr_filter(train2_X,test2_X,corrs2,0.4)

In [17]:
#print('For correlation filter r>0.1, we select feature number: ',train2_X0.shape[1])
print('For correlation filter r>0.2, we select feature number: ',train2_X1.shape[1])
#print('For correlation filter r>0.3, we select feature number: ',train2_X2.shape[1])
#print('For correlation filter r>0.4, we select feature number: ',train2_X3.shape[1])

For correlation filter r>0.2, we select feature number:  157


# scaling and modeling
scaler = MinMaxScaler()
train2_X0_scaled = scaler.fit_transform(train2_X0)
test2_X0_scaled = scaler.transform(test2_X0)

train2_X1_scaled = scaler.fit_transform(train2_X1)
test2_X1_scaled = scaler.transform(test2_X1)

train2_X2_scaled = scaler.fit_transform(train2_X2)
test2_X2_scaled = scaler.transform(test2_X2)

train2_X3_scaled = scaler.fit_transform(train2_X3)
test2_X3_scaled = scaler.transform(test2_X3)


para = range(1,100)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.1, 303 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,val2_X1,val2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

In [18]:
para = range(1,300)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.1, 303 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,val2_X1,val2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

For correlation filter r>=0.1, 303 features are selected
alpha: 1  L1-ratio: 0.3
Nonzero weights: 145 from 157
Best RMSE is updated! 
The mean absolute error is:  4.671115108748183
The root mean squared error is:  9.716962876951671
The correlation is:  0.405634299952857
----------------------------------------
alpha: 2  L1-ratio: 0.3
Nonzero weights: 127 from 157
Best RMSE is updated! 
The mean absolute error is:  4.546050398791747
The root mean squared error is:  9.246737052850373
The correlation is:  0.41970787208318444
----------------------------------------
alpha: 3  L1-ratio: 0.3
Nonzero weights: 115 from 157
Best RMSE is updated! 
The mean absolute error is:  4.42809703034297
The root mean squared error is:  8.834858669518795
The correlation is:  0.43396075011980706
----------------------------------------
alpha: 4  L1-ratio: 0.3
Nonzero weights: 103 from 157
Best RMSE is updated! 
The mean absolute error is:  4.321663019611135
The root mean squared error is:  8.449403727344333


Nonzero weights: 41 from 157
Best RMSE is updated! 
The mean absolute error is:  3.638778274153861
The root mean squared error is:  5.987954753522455
The correlation is:  0.6141277962338357
----------------------------------------
alpha: 30  L1-ratio: 0.3
Nonzero weights: 41 from 157
Best RMSE is updated! 
The mean absolute error is:  3.632773650197482
The root mean squared error is:  5.977764040243814
The correlation is:  0.615455933018631
----------------------------------------
alpha: 31  L1-ratio: 0.3
Nonzero weights: 41 from 157
Best RMSE is updated! 
The mean absolute error is:  3.6269387594339038
The root mean squared error is:  5.969359227925628
The correlation is:  0.6165716044000372
----------------------------------------
alpha: 32  L1-ratio: 0.3
Nonzero weights: 41 from 157
Best RMSE is updated! 
The mean absolute error is:  3.621498164887815
The root mean squared error is:  5.962544404476867
The correlation is:  0.6174945699349111
----------------------------------------
a

Nonzero weights: 11 from 157
alpha: 155  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 156  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 157  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 158  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 159  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 160  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 161  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 162  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 163  L1-ratio: 0.3
Nonzero weights: 11 from 157
alpha: 164  L1-ratio: 0.3
Nonzero weights: 10 from 157
alpha: 165  L1-ratio: 0.3
Nonzero weights: 10 from 157
alpha: 166  L1-ratio: 0.3
Nonzero weights: 10 from 157
alpha: 167  L1-ratio: 0.3
Nonzero weights: 10 from 157
alpha: 168  L1-ratio: 0.3
Nonzero weights: 10 from 157
alpha: 169  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 170  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 171  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 172  L1-ratio: 0.3
Nonzero weigh

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 154 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 105 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

In [19]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [34]
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.2, 212 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,test2_X1,test2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

For correlation filter r>=0.2, 212 features are selected
alpha: 34  L1-ratio: 0.3
Nonzero weights: 38 from 157
Best RMSE is updated! 
The mean absolute error is:  2.887066578096806
The root mean squared error is:  3.4216594534068885
The correlation is:  0.892312479509743
----------------------------------------
