## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful tp scale data only based on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2:Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the amount of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Hult-Winters).

In [6]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# initialize the plotting sizes
# set size
plt.rc('figure', figsize=(10, 5))
# subplots size
fizsize_with_subplots = (10, 10)
# histogram size
bin_size = 10

In [7]:
# Loading data
dates = pd.read_csv('data/dates.csv',header=None)
queries = pd.read_csv('data/queries.csv',header=None)
X = pd.read_csv('data/X.csv',header=None)
y = pd.read_csv('data/y.csv',header=None)

X.columns = queries

In [8]:
# data preprocessing and train-test split
# this data is well formed with no missing value and other symbols or labels that are non numerical.
# splitting the data into train and test

# Here we want the first 500 queries
X = X.iloc[:, 0: 250]

# first with the last year as test
# for convinience, pick the two validation set starting from 1/3 and 2/3 of the training set,
# 10% of training set as validation, here use 400 data points, two 200 periods.
test_size =365
length = X.shape[0]
l =(X.shape[0]-test_size)//3

train1_X = pd.concat([X[0:l],X[l+200:2*l],X[2*l+200:length-test_size]])
val1_X = pd.concat([X[l:l+200],X[2*l:2*l+200]])
train1_y = pd.concat([y[0:l],y[l+200:2*l],y[2*l+200:length-test_size]])
val1_y = pd.concat([y[l:l+200],y[2*l:2*l+200]])
test1_X = X[length-test_size:]
test1_y = y[length-test_size:]

# second with the last 2 year as test
test_size2 =365*2
l2 =(X.shape[0]-test_size2)//3

train2_X = pd.concat([X[0:l2],X[l2+180:2*l2],X[2*l2+180:length-test_size2]])
val2_X = pd.concat([X[l2:l2+180],X[2*l2:2*l2+180]])
train2_y = pd.concat([y[0:l2],y[l2+180:2*l2],y[2*l2+180:length-test_size2]])
val2_y = pd.concat([y[l2:l2+180],y[2*l2:2*l2+180]])
test2_X = X[length-test_size2:]
test2_y = y[length-test_size2:]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('validation1 X:',val1_X.shape,' validation1 y:',val1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)
print('validation2 X:',val2_X.shape,' validation2 y:',val2_y.shape)

X shape:  (4383, 250)   y shape:  (4383, 1)
train1 X: (3618, 250)  train1 y: (3618, 1)  Test1 X shape: (365, 250)  Test1 y shape: (365, 1)
validation1 X: (400, 250)  validation1 y: (400, 1)
train2 X: (3293, 250)  train2 y: (3293, 1)  Test2 X: (730, 250)  Test2 y: (730, 1)
validation2 X: (360, 250)  validation2 y: (360, 1)


In [9]:
train_size = train1_X.shape[0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs[i] = 0
    else:
        corrs[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [10]:
def corr_filter(df_X,df_Xval,df_Xtest,corrs,threshold):
    X = df_X.copy()
    X_test = df_Xtest.copy()
    X_val = df_Xval.copy()
    col_corr = set() # Set of all the names of deleted columns
    for i in range(0,250):
            if corrs[i,0] < threshold: 
                #print(X.columns[i])
                colname = df_X.columns[i]
                del X[colname] # deleting the column from the dataset
                del X_test[colname] 
                del X_val[colname]

    return X,X_val,X_test

## Fixing pearson correlation filter r>=0.2

In [11]:
#train1_X0,test1_X0 = corr_filter(train1_X,test1_X,corrs,0.1)
train1_X1,val1_X1,test1_X1 = corr_filter(train1_X,val1_X,test1_X,corrs,0.2)
#train1_X2,test1_X2 = corr_filter(train1_X,test1_X,corrs,0.3)
#train1_X3,test1_X3 = corr_filter(train1_X,test1_X,corrs,0.4)

In [12]:
#print('For correlation filter r>=0.1, we select feature number: ',train1_X0.shape[1])
print('For correlation filter r>=0.2, we select feature number: ',train1_X1.shape[1])
#print('For correlation filter r>=0.3, we select feature number: ',train1_X2.shape[1])
#print('For correlation filter r>=0.4, we select feature number: ',train1_X3.shape[1])

For correlation filter r>=0.2, we select feature number:  157


In [13]:
# Defien the mearure matrics, MAE, RMSE, CORR
# define three metrics: mean absolute error, root mean squared error and Pearson's correlation.
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.correcoef returns Pearson product-moment correlation coefficients
def pearson_r(x,y):   
    corr_mat = np.corrcoef(x,y)
    return corr_mat[0,1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convinient tuning
def eNet(a,l,train_X,train_y,test_X,test_y):
    
    # scaling and modeling
    scalerX = StandardScaler()
    scalerX.fit(train_X)
    train_X = scalerX.transform(train_X)
    test_X = scalerX.transform(test_X)
    
    scalery = StandardScaler() 
    scalery.fit(train_y) 
    train_y = scalery.transform(train_y) 
    
    
    alpha = a
    l1_ratio = l
    enet = ElasticNet(alpha=a, l1_ratio=l, normalize=False,max_iter=10000)
    enet.fit(train_X,train_y)
    print('Nonzero weights: %d from %d' % (len(np.nonzero(enet.coef_)[0]), len(enet.coef_)))
    y_pred1 = enet.predict(test_X)
    y_pred1 = scalery.inverse_transform(y_pred1)

    mae1 = mean_absolute_error(test_y, y_pred1)
    #print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(test_y, y_pred1))
    #print('The root mean squared error is: ',rmse1)
    
    corr_y = test_y.copy()
    corr_y['y_act'] = test_y
    corr_y['y_pred']= y_pred1
    corr = np.corrcoef(corr_y['y_act'],corr_y['y_pred'])[0,1]
    #print('The correlation is: ',corr)
    
    return rmse1, mae1,corr
    
    


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.1, 310 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

In [14]:
np

<module 'numpy' from '/home/yiyangsu/anaconda3/lib/python3.6/site-packages/numpy/__init__.py'>

In [15]:
## Fixing l1-ratio = 0.3, only tuning alpha.
para = np.arange(0.01, 2.0, 0.01)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.2, 212 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train1_X1,train1_y,val1_X1,val1_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')
        

For correlation filter r>=0.2, 212 features are selected
alpha: 0.01  L1-ratio: 0.3
Nonzero weights: 122 from 157
Best RMSE is updated! 
The mean absolute error is:  18.758344355984708
The root mean squared error is:  36.48630170603103
The correlation is:  0.8177579535794238
----------------------------------------
alpha: 0.02  L1-ratio: 0.3
Nonzero weights: 109 from 157
alpha: 0.03  L1-ratio: 0.3
Nonzero weights: 87 from 157
Best RMSE is updated! 
The mean absolute error is:  18.396001477018206
The root mean squared error is:  34.961866255421214
The correlation is:  0.8563961712787699
----------------------------------------
alpha: 0.04  L1-ratio: 0.3
Nonzero weights: 82 from 157
Best RMSE is updated! 
The mean absolute error is:  17.23346852174038
The root mean squared error is:  32.3694644378473
The correlation is:  0.8629858157804526
----------------------------------------
alpha: 0.05  L1-ratio: 0.3
Nonzero weights: 75 from 157
Best RMSE is updated! 
The mean absolute error is:  1

Best RMSE is updated! 
The mean absolute error is:  9.625115422156027
The root mean squared error is:  16.143106760749962
The correlation is:  0.8852878678802256
----------------------------------------
alpha: 0.33  L1-ratio: 0.3
Nonzero weights: 26 from 157
Best RMSE is updated! 
The mean absolute error is:  9.567697716684656
The root mean squared error is:  16.013473792450352
The correlation is:  0.8853855098086727
----------------------------------------
alpha: 0.34  L1-ratio: 0.3
Nonzero weights: 25 from 157
Best RMSE is updated! 
The mean absolute error is:  9.506475620488049
The root mean squared error is:  15.87766271096325
The correlation is:  0.8854972727976004
----------------------------------------
alpha: 0.35000000000000003  L1-ratio: 0.3
Nonzero weights: 25 from 157
Best RMSE is updated! 
The mean absolute error is:  9.445367001788318
The root mean squared error is:  15.740895754842752
The correlation is:  0.8856164022722868
----------------------------------------
alpha:

Nonzero weights: 14 from 157
Best RMSE is updated! 
The mean absolute error is:  8.326437515653598
The root mean squared error is:  13.284551097778348
The correlation is:  0.8860745844551048
----------------------------------------
alpha: 0.62  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  8.288125666126179
The root mean squared error is:  13.210395828585446
The correlation is:  0.8860522704042763
----------------------------------------
alpha: 0.63  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  8.24948105868982
The root mean squared error is:  13.139174665844013
The correlation is:  0.8860164543559025
----------------------------------------
alpha: 0.64  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  8.210741736625524
The root mean squared error is:  13.069266614803476
The correlation is:  0.8859782994391998
---------------------------------

alpha: 0.98  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 0.99  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.0  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.01  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.02  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.03  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.04  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.05  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.06  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.07  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.08  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.09  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.1  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.11  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.12  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.1300000000000001  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.1400000000000001  L1-ratio: 0.3
Nonzero weights: 9 from 157
alpha: 1.1500000000000001  L1-ratio: 0.

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 150 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 103 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

In [16]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [0.9]
para_l=[0.3]
rmse=100000
mae=1000000
corr=0

for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train1_X1,train1_y,test1_X1,test1_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

For correlation filter r>=0.2, 212 features are selected
alpha: 0.9  L1-ratio: 0.3
Nonzero weights: 11 from 157
Best RMSE is updated! 
The mean absolute error is:  4.025636980566922
The root mean squared error is:  4.544224744424967
The correlation is:  0.8391743685708768
----------------------------------------


## For the last 2 years as the testing set

In [22]:
train_size = 3653
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs2[i] = 0
    else:
        corrs2[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs2.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [23]:
#train2_X0,test2_X0 = corr_filter(train2_X,test2_X,corrs2,0.1)
train2_X1,val2_X1,test2_X1 = corr_filter(train2_X,val2_X,test2_X,corrs2,0.2)
#train2_X2,test2_X2 = corr_filter(train2_X,test2_X,corrs2,0.3)
#train2_X3,test2_X3 = corr_filter(train2_X,test2_X,corrs2,0.4)

In [24]:
#print('For correlation filter r>0.1, we select feature number: ',train2_X0.shape[1])
print('For correlation filter r>0.2, we select feature number: ',train2_X1.shape[1])
#print('For correlation filter r>0.3, we select feature number: ',train2_X2.shape[1])
#print('For correlation filter r>0.4, we select feature number: ',train2_X3.shape[1])

For correlation filter r>0.2, we select feature number:  157


para = range(1,100)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.1, 303 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,val2_X1,val2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

In [25]:
para = np.arange(0.01, 3.0, 0.01)
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.1, 303 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,val2_X1,val2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

For correlation filter r>=0.1, 303 features are selected
alpha: 0.01  L1-ratio: 0.3
Nonzero weights: 118 from 157
Best RMSE is updated! 
The mean absolute error is:  4.978415541845191
The root mean squared error is:  10.9539269209476
The correlation is:  0.35416707307360035
----------------------------------------
alpha: 0.02  L1-ratio: 0.3
Nonzero weights: 100 from 157
alpha: 0.03  L1-ratio: 0.3
Nonzero weights: 85 from 157
alpha: 0.04  L1-ratio: 0.3
Nonzero weights: 76 from 157
alpha: 0.05  L1-ratio: 0.3
Nonzero weights: 74 from 157
alpha: 0.060000000000000005  L1-ratio: 0.3
Nonzero weights: 68 from 157
alpha: 0.06999999999999999  L1-ratio: 0.3
Nonzero weights: 64 from 157
alpha: 0.08  L1-ratio: 0.3
Nonzero weights: 63 from 157
alpha: 0.09  L1-ratio: 0.3
Nonzero weights: 62 from 157
alpha: 0.09999999999999999  L1-ratio: 0.3
Nonzero weights: 57 from 157
alpha: 0.11  L1-ratio: 0.3
Nonzero weights: 54 from 157
alpha: 0.12  L1-ratio: 0.3
Nonzero weights: 52 from 157
alpha: 0.13  L1-ratio

Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  6.93032233202379
The root mean squared error is:  10.59790946492945
The correlation is:  0.11258273545552278
----------------------------------------
alpha: 1.21  L1-ratio: 0.3
Nonzero weights: 19 from 157
Best RMSE is updated! 
The mean absolute error is:  6.9287941160659
The root mean squared error is:  10.563010188935229
The correlation is:  0.11245054424704508
----------------------------------------
alpha: 1.22  L1-ratio: 0.3
Nonzero weights: 18 from 157
Best RMSE is updated! 
The mean absolute error is:  6.9269015670461105
The root mean squared error is:  10.527083549290108
The correlation is:  0.11238622615068126
----------------------------------------
alpha: 1.23  L1-ratio: 0.3
Nonzero weights: 18 from 157
Best RMSE is updated! 
The mean absolute error is:  6.9247830474141185
The root mean squared error is:  10.4906491655136
The correlation is:  0.11236070312030016
--------------------------------

Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  6.818954651604112
The root mean squared error is:  9.401072353428555
The correlation is:  0.10933996417103217
----------------------------------------
alpha: 1.59  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  6.814965759116218
The root mean squared error is:  9.372910188831426
The correlation is:  0.10926223108214268
----------------------------------------
alpha: 1.6  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  6.81089482631245
The root mean squared error is:  9.344843213823687
The correlation is:  0.10919072512048152
----------------------------------------
alpha: 1.61  L1-ratio: 0.3
Nonzero weights: 13 from 157
Best RMSE is updated! 
The mean absolute error is:  6.806801285916216
The root mean squared error is:  9.317023294785386
The correlation is:  0.10912007506588046
----------------------------------

Nonzero weights: 8 from 157
Best RMSE is updated! 
The mean absolute error is:  6.679036135574057
The root mean squared error is:  8.61177518926514
The correlation is:  0.10628508238259964
----------------------------------------
alpha: 1.94  L1-ratio: 0.3
Nonzero weights: 8 from 157
Best RMSE is updated! 
The mean absolute error is:  6.676381449218707
The root mean squared error is:  8.5973969510922
The correlation is:  0.10606124078424635
----------------------------------------
alpha: 1.95  L1-ratio: 0.3
Nonzero weights: 8 from 157
Best RMSE is updated! 
The mean absolute error is:  6.67372744127321
The root mean squared error is:  8.58325942715768
The correlation is:  0.10583271788009783
----------------------------------------
alpha: 1.96  L1-ratio: 0.3
Nonzero weights: 8 from 157
Best RMSE is updated! 
The mean absolute error is:  6.671074018919027
The root mean squared error is:  8.569362108851335
The correlation is:  0.10559934638375625
----------------------------------------


Nonzero weights: 5 from 157
Best RMSE is updated! 
The mean absolute error is:  6.5866351410025725
The root mean squared error is:  8.251325226945804
The correlation is:  0.0974426376381228
----------------------------------------
alpha: 2.27  L1-ratio: 0.3
Nonzero weights: 5 from 157
Best RMSE is updated! 
The mean absolute error is:  6.5817311493511435
The root mean squared error is:  8.241631808727421
The correlation is:  0.09762875849522568
----------------------------------------
alpha: 2.28  L1-ratio: 0.3
Nonzero weights: 5 from 157
Best RMSE is updated! 
The mean absolute error is:  6.57683901932232
The root mean squared error is:  8.232240397539169
The correlation is:  0.0978239364684254
----------------------------------------
alpha: 2.29  L1-ratio: 0.3
Nonzero weights: 5 from 157
Best RMSE is updated! 
The mean absolute error is:  6.571958701575948
The root mean squared error is:  8.22314978049691
The correlation is:  0.09802880857947699
--------------------------------------

Nonzero weights: 3 from 157
alpha: 2.63  L1-ratio: 0.3
Nonzero weights: 3 from 157
alpha: 2.6399999999999997  L1-ratio: 0.3
Nonzero weights: 3 from 157
alpha: 2.65  L1-ratio: 0.3
Nonzero weights: 3 from 157
alpha: 2.6599999999999997  L1-ratio: 0.3
Nonzero weights: 2 from 157
alpha: 2.67  L1-ratio: 0.3
Nonzero weights: 2 from 157
alpha: 2.6799999999999997  L1-ratio: 0.3
Nonzero weights: 2 from 157
alpha: 2.69  L1-ratio: 0.3
Nonzero weights: 2 from 157
alpha: 2.6999999999999997  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.71  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.7199999999999998  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.73  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.7399999999999998  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.75  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.76  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.77  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2.78  L1-ratio: 0.3
Nonzero weights: 1 from 157
alpha: 2

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 154 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 105 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

In [26]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [2.60]
para_l=[0.3]
rmse=100000
mae=1000000
corr=0
print('For correlation filter r>=0.2, 212 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        rmse0,mae0,corr0 = eNet(a,l,train2_X1,train2_y,test2_X1,test2_y)
        if rmse0< rmse:
            rmse=rmse0
            mae=mae0
            corr = corr0
            print('========================================')
            print('Best RMSE is updated! ' )
            print('The mean absolute error is: ',mae)
            print('The root mean squared error is: ',rmse)
            print('The correlation is: ',corr)
            print('----------------------------------------')

For correlation filter r>=0.2, 212 features are selected
alpha: 2.6  L1-ratio: 0.3
Nonzero weights: 3 from 157
Best RMSE is updated! 
The mean absolute error is:  5.806283856375737
The root mean squared error is:  6.670485083096372
The correlation is:  0.7505757759942205
----------------------------------------
