## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful tp scale data only based on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2:Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the amount of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Hult-Winters).

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint
from sklearn.linear_model import Ridge
from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# initialize the plotting sizes
# set size
plt.rc('figure', figsize=(10, 5))
# subplots size
fizsize_with_subplots = (10, 10)
# histogram size
bin_size = 10

In [2]:
# Loading data
dates = pd.read_csv('data/dates.csv',header=None)
queries = pd.read_csv('data/queries.csv',header=None)
X = pd.read_csv('data/X.csv',header=None)
y = pd.read_csv('data/y.csv',header=None)

X.columns = queries

In [3]:
# data preprocessing and train-test split
# this data is well formed with no missing value and other symbols or labels that are non numerical.
# splitting the data into train and test

# Here we want the first 500 queries
X = X.iloc[:, 0: 250]

# first with the last year as test
# for convinience, pick the two validation set starting from 1/3 and 2/3 of the training set,
# 10% of training set as validation, here use 400 data points, two 200 periods.
test_size =365
length = X.shape[0]
l =(X.shape[0]-test_size)//3

train1_X = pd.concat([X[0:l],X[l+200:2*l],X[2*l+200:length-test_size]])
val1_X = pd.concat([X[l:l+200],X[2*l:2*l+200]])
train1_y = pd.concat([y[0:l],y[l+200:2*l],y[2*l+200:length-test_size]])
val1_y = pd.concat([y[l:l+200],y[2*l:2*l+200]])
test1_X = X[length-test_size:]
test1_y = y[length-test_size:]

# second with the last 2 year as test
test_size2 =365*2
l2 =(X.shape[0]-test_size2)//3

train2_X = pd.concat([X[0:l2],X[l2+180:2*l2],X[2*l2+180:length-test_size2]])
val2_X = pd.concat([X[l2:l2+180],X[2*l2:2*l2+180]])
train2_y = pd.concat([y[0:l2],y[l2+180:2*l2],y[2*l2+180:length-test_size2]])
val2_y = pd.concat([y[l2:l2+180],y[2*l2:2*l2+180]])
test2_X = X[length-test_size2:]
test2_y = y[length-test_size2:]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('validation1 X:',val1_X.shape,' validation1 y:',val1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)
print('validation2 X:',val2_X.shape,' validation2 y:',val2_y.shape)

X shape:  (4383, 250)   y shape:  (4383, 1)
train1 X: (3618, 250)  train1 y: (3618, 1)  Test1 X shape: (365, 250)  Test1 y shape: (365, 1)
validation1 X: (400, 250)  validation1 y: (400, 1)
train2 X: (3293, 250)  train2 y: (3293, 1)  Test2 X: (730, 250)  Test2 y: (730, 1)
validation2 X: (360, 250)  validation2 y: (360, 1)


In [4]:
train_size = train1_X.shape[0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs[i] = 0
    else:
        corrs[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [5]:
def corr_filter(df_X,df_Xval,df_Xtest,corrs,threshold):
    X = df_X.copy()
    X_test = df_Xtest.copy()
    X_val = df_Xval.copy()
    col_corr = set() # Set of all the names of deleted columns
    for i in range(0,250):
            if corrs[i,0] < threshold: 
                #print(X.columns[i])
                colname = df_X.columns[i]
                del X[colname] # deleting the column from the dataset
                del X_test[colname] 
                del X_val[colname]

    return X,X_val,X_test

## Fixing pearson correlation filter r>=0.2

In [6]:
#train1_X0,test1_X0 = corr_filter(train1_X,test1_X,corrs,0.1)
train1_X1,val1_X1,test1_X1 = corr_filter(train1_X,val1_X,test1_X,corrs,0.2)
#train1_X2,test1_X2 = corr_filter(train1_X,test1_X,corrs,0.3)
#train1_X3,test1_X3 = corr_filter(train1_X,test1_X,corrs,0.4)

In [7]:
#print('For correlation filter r>=0.1, we select feature number: ',train1_X0.shape[1])
print('For correlation filter r>=0.2, we select feature number: ',train1_X1.shape[1])
#print('For correlation filter r>=0.3, we select feature number: ',train1_X2.shape[1])
#print('For correlation filter r>=0.4, we select feature number: ',train1_X3.shape[1])

For correlation filter r>=0.2, we select feature number:  157


In [8]:
# Defien the mearure matrics, MAE, RMSE, CORR
# define three metrics: mean absolute error, root mean squared error and Pearson's correlation.
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.correcoef returns Pearson product-moment correlation coefficients
def pearson_r(x,y):   
    corr_mat = np.corrcoef(x,y)
    return corr_mat[0,1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convinient tuning
def ridge(a,train_X,train_y,test_X,test_y):
    # scaling and modeling
    scalerX = StandardScaler()
    scalerX.fit(train_X)
    train_X = scalerX.transform(train_X)
    test_X = scalerX.transform(test_X)
    
    scalery = StandardScaler() 
    scalery.fit(train_y) 
    train_y = scalery.transform(train_y) 
    
    
    alpha=a
    ri = Ridge(alpha, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
  
    ri.fit(train_X,train_y)

    y_pred1 = ri.predict(test_X)
    y_pred1 = scalery.inverse_transform(y_pred1)

    mae1 = mean_absolute_error(test_y, y_pred1)
    #print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(test_y, y_pred1))
    #print('The root mean squared error is: ',rmse1)
    
    corr_y = test_y.copy()
    corr_y['y_act'] = test_y
    corr_y['y_pred']= y_pred1
    corr = np.corrcoef(corr_y['y_act'],corr_y['y_pred'])[0,1]
    #print('The correlation is: ',corr)
    
    return rmse1, mae1,corr
    
    


# scaling and modeling
scaler = MinMaxScaler()
train1_X0_scaled = scaler.fit_transform(train1_X0)
test1_X0_scaled = scaler.transform(test1_x0)

train1_X1_scaled = scaler.fit_transform(train1_X1)
test1_X1_scaled = scaler.transform(test1_x1)

train1_X2_scaled = scaler.fit_transform(train1_X2)
test1_X2_scaled = scaler.transform(test1_x2)

train1_X3_scaled = scaler.fit_transform(train1_X3)
test1_X3_scaled = scaler.transform(test1_x3)


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.1, 310 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

In [9]:

para = np.arange(0, 10000000, 10000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')
        

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  22.738477373935684
The root mean squared error is:  45.95805279347314
The correlation is:  0.7576646272128614
----------------------------------------
alpha: 10000
Best RMSE is updated! 
The mean absolute error is:  16.711505749513822
The root mean squared error is:  31.749734057273503
The correlation is:  0.8031339020365239
----------------------------------------
alpha: 20000
Best RMSE is updated! 
The mean absolute error is:  14.715536414461608
The root mean squared error is:  27.80398238959264
The correlation is:  0.7935808781893313
----------------------------------------
alpha: 30000
Best RMSE is updated! 
The mean absolute error is:  13.400096739561404
The root mean squared error is:  25.21790854319866
The correlation is:  0.7875531686540677
----------------------------------------
alpha: 40000
Best RMSE is updated! 
The mean absolute error is:  12.377312840804953
The root mean squared error is:  23.287918384884467
The

alpha: 2760000
alpha: 2770000
alpha: 2780000
alpha: 2790000
alpha: 2800000
alpha: 2810000
alpha: 2820000
alpha: 2830000
alpha: 2840000
alpha: 2850000
alpha: 2860000
alpha: 2870000
alpha: 2880000
alpha: 2890000
alpha: 2900000
alpha: 2910000
alpha: 2920000
alpha: 2930000
alpha: 2940000
alpha: 2950000
alpha: 2960000
alpha: 2970000
alpha: 2980000
alpha: 2990000
alpha: 3000000
alpha: 3010000
alpha: 3020000
alpha: 3030000
alpha: 3040000
alpha: 3050000
alpha: 3060000
alpha: 3070000
alpha: 3080000
alpha: 3090000
alpha: 3100000
alpha: 3110000
alpha: 3120000
alpha: 3130000
alpha: 3140000
alpha: 3150000
alpha: 3160000
alpha: 3170000
alpha: 3180000
alpha: 3190000
alpha: 3200000
alpha: 3210000
alpha: 3220000
alpha: 3230000
alpha: 3240000
alpha: 3250000
alpha: 3260000
alpha: 3270000
alpha: 3280000
alpha: 3290000
alpha: 3300000
alpha: 3310000
alpha: 3320000
alpha: 3330000
alpha: 3340000
alpha: 3350000
alpha: 3360000
alpha: 3370000
alpha: 3380000
alpha: 3390000
alpha: 3400000
alpha: 3410000
alpha: 342

alpha: 8370000
alpha: 8380000
alpha: 8390000
alpha: 8400000
alpha: 8410000
alpha: 8420000
alpha: 8430000
alpha: 8440000
alpha: 8450000
alpha: 8460000
alpha: 8470000
alpha: 8480000
alpha: 8490000
alpha: 8500000
alpha: 8510000
alpha: 8520000
alpha: 8530000
alpha: 8540000
alpha: 8550000
alpha: 8560000
alpha: 8570000
alpha: 8580000
alpha: 8590000
alpha: 8600000
alpha: 8610000
alpha: 8620000
alpha: 8630000
alpha: 8640000
alpha: 8650000
alpha: 8660000
alpha: 8670000
alpha: 8680000
alpha: 8690000
alpha: 8700000
alpha: 8710000
alpha: 8720000
alpha: 8730000
alpha: 8740000
alpha: 8750000
alpha: 8760000
alpha: 8770000
alpha: 8780000
alpha: 8790000
alpha: 8800000
alpha: 8810000
alpha: 8820000
alpha: 8830000
alpha: 8840000
alpha: 8850000
alpha: 8860000
alpha: 8870000
alpha: 8880000
alpha: 8890000
alpha: 8900000
alpha: 8910000
alpha: 8920000
alpha: 8930000
alpha: 8940000
alpha: 8950000
alpha: 8960000
alpha: 8970000
alpha: 8980000
alpha: 8990000
alpha: 9000000
alpha: 9010000
alpha: 9020000
alpha: 903

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 150 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 103 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

In [10]:
para =[0,170000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  22.738477373935684
The root mean squared error is:  45.95805279347314
The correlation is:  0.7576646272128614
----------------------------------------
alpha: 170000
Best RMSE is updated! 
The mean absolute error is:  9.551547496559987
The root mean squared error is:  16.48333838435159
The correlation is:  0.7688020686719569
----------------------------------------


## For the last 2 years as the testing set

In [12]:
train_size = 3653
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs2[i] = 0
    else:
        corrs2[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs2.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [13]:
#train2_X0,test2_X0 = corr_filter(train2_X,test2_X,corrs2,0.1)
train2_X1,val2_X1,test2_X1 = corr_filter(train2_X,val2_X,test2_X,corrs2,0.2)
#train2_X2,test2_X2 = corr_filter(train2_X,test2_X,corrs2,0.3)
#train2_X3,test2_X3 = corr_filter(train2_X,test2_X,corrs2,0.4)

In [14]:
#print('For correlation filter r>0.1, we select feature number: ',train2_X0.shape[1])
print('For correlation filter r>0.2, we select feature number: ',train2_X1.shape[1])
#print('For correlation filter r>0.3, we select feature number: ',train2_X2.shape[1])
#print('For correlation filter r>0.4, we select feature number: ',train2_X3.shape[1])

For correlation filter r>0.2, we select feature number:  157


# scaling and modeling
scaler = MinMaxScaler()
train2_X0_scaled = scaler.fit_transform(train2_X0)
test2_X0_scaled = scaler.transform(test2_X0)

train2_X1_scaled = scaler.fit_transform(train2_X1)
test2_X1_scaled = scaler.transform(test2_X1)

train2_X2_scaled = scaler.fit_transform(train2_X2)
test2_X2_scaled = scaler.transform(test2_X2)

train2_X3_scaled = scaler.fit_transform(train2_X3)
test2_X3_scaled = scaler.transform(test2_X3)


para = range(1,100)
para = np.arange(0.01, 10.0, 0.01)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

In [15]:

para =np.arange(0, 10000000, 10000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train2_X1,train2_y,val2_X1,val2_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  4.82414118172931
The root mean squared error is:  10.315163209348514
The correlation is:  0.38850181146732626
----------------------------------------
alpha: 10000
alpha: 20000
alpha: 30000
alpha: 40000
alpha: 50000
alpha: 60000
alpha: 70000
alpha: 80000
alpha: 90000
alpha: 100000
alpha: 110000
alpha: 120000
alpha: 130000
alpha: 140000
alpha: 150000
alpha: 160000
alpha: 170000
alpha: 180000
alpha: 190000
alpha: 200000
alpha: 210000
alpha: 220000
alpha: 230000
alpha: 240000
alpha: 250000
alpha: 260000
alpha: 270000
alpha: 280000
alpha: 290000
alpha: 300000
alpha: 310000
alpha: 320000
alpha: 330000
alpha: 340000
Best RMSE is updated! 
The mean absolute error is:  7.088104967771647
The root mean squared error is:  10.244669138668918
The correlation is:  0.11774946173914073
----------------------------------------
alpha: 350000
Best RMSE is updated! 
The mean absolute error is:  7.074819950359034
The root mean squared error is:  

Best RMSE is updated! 
The mean absolute error is:  6.82109885449531
The root mean squared error is:  8.8893937652596
The correlation is:  0.11584895669283876
----------------------------------------
alpha: 670000
Best RMSE is updated! 
The mean absolute error is:  6.815999044611132
The root mean squared error is:  8.86884277846605
The correlation is:  0.11581822464802352
----------------------------------------
alpha: 680000
Best RMSE is updated! 
The mean absolute error is:  6.811013496910155
The root mean squared error is:  8.848994219836788
The correlation is:  0.11578837826842543
----------------------------------------
alpha: 690000
Best RMSE is updated! 
The mean absolute error is:  6.806138457963281
The root mean squared error is:  8.829817771705004
The correlation is:  0.11575937980563132
----------------------------------------
alpha: 700000
Best RMSE is updated! 
The mean absolute error is:  6.801370334455151
The root mean squared error is:  8.811284694930363
The correlation

Best RMSE is updated! 
The mean absolute error is:  6.693040469878374
The root mean squared error is:  8.449268149092987
The correlation is:  0.11513046579242571
----------------------------------------
alpha: 1020000
Best RMSE is updated! 
The mean absolute error is:  6.690482358391927
The root mean squared error is:  8.442115479682514
The correlation is:  0.11511708419713308
----------------------------------------
alpha: 1030000
Best RMSE is updated! 
The mean absolute error is:  6.68796565441425
The root mean squared error is:  8.435147414840207
The correlation is:  0.11510395892660547
----------------------------------------
alpha: 1040000
Best RMSE is updated! 
The mean absolute error is:  6.685489364940418
The root mean squared error is:  8.428357964063181
The correlation is:  0.11509108268529447
----------------------------------------
alpha: 1050000
Best RMSE is updated! 
The mean absolute error is:  6.683052528302849
The root mean squared error is:  8.421741372085597
The corr

Best RMSE is updated! 
The mean absolute error is:  6.6277175474989845
The root mean squared error is:  8.28893545880898
The correlation is:  0.11480101405735955
----------------------------------------
alpha: 1340000
Best RMSE is updated! 
The mean absolute error is:  6.62611264045242
The root mean squared error is:  8.285586640666098
The correlation is:  0.11479322762298891
----------------------------------------
alpha: 1350000
Best RMSE is updated! 
The mean absolute error is:  6.624528409210485
The root mean squared error is:  8.28230899690883
The correlation is:  0.11478555534491282
----------------------------------------
alpha: 1360000
Best RMSE is updated! 
The mean absolute error is:  6.622964457701634
The root mean squared error is:  8.279100643929711
The correlation is:  0.11477799473083315
----------------------------------------
alpha: 1370000
Best RMSE is updated! 
The mean absolute error is:  6.62147368374526
The root mean squared error is:  8.27595975841432
The correla

Best RMSE is updated! 
The mean absolute error is:  6.584619135617426
The root mean squared error is:  8.206008473897814
The correlation is:  0.11458814839737055
----------------------------------------
alpha: 1680000
Best RMSE is updated! 
The mean absolute error is:  6.5836083336663
The root mean squared error is:  8.204300542389504
The correlation is:  0.11458318103783226
----------------------------------------
alpha: 1690000
Best RMSE is updated! 
The mean absolute error is:  6.582608218251222
The root mean squared error is:  8.202623495743149
The correlation is:  0.11457827197349073
----------------------------------------
alpha: 1700000
Best RMSE is updated! 
The mean absolute error is:  6.5816186210385945
The root mean squared error is:  8.200976648741564
The correlation is:  0.11457342018410446
----------------------------------------
alpha: 1710000
Best RMSE is updated! 
The mean absolute error is:  6.580639377206045
The root mean squared error is:  8.19935933450267
The corre

Best RMSE is updated! 
The mean absolute error is:  6.551129656670763
The root mean squared error is:  8.156374556684288
The correlation is:  0.11442663381529372
----------------------------------------
alpha: 2080000
Best RMSE is updated! 
The mean absolute error is:  6.5504429230866865
The root mean squared error is:  8.155507074647401
The correlation is:  0.1144233864686241
----------------------------------------
alpha: 2090000
Best RMSE is updated! 
The mean absolute error is:  6.549762186378345
The root mean squared error is:  8.154653139420175
The correlation is:  0.11442016998677862
----------------------------------------
alpha: 2100000
Best RMSE is updated! 
The mean absolute error is:  6.5490873684080535
The root mean squared error is:  8.1538124973148
The correlation is:  0.11441698393179228
----------------------------------------
alpha: 2110000
Best RMSE is updated! 
The mean absolute error is:  6.548418392388384
The root mean squared error is:  8.152984900341856
The corr

Best RMSE is updated! 
The mean absolute error is:  6.5276726809670285
The root mean squared error is:  8.130172290602825
The correlation is:  0.11431713190858794
----------------------------------------
alpha: 2480000
Best RMSE is updated! 
The mean absolute error is:  6.527175980963466
The root mean squared error is:  8.12969387288434
The correlation is:  0.11431484435358161
----------------------------------------
alpha: 2490000
Best RMSE is updated! 
The mean absolute error is:  6.5266829747568735
The root mean squared error is:  8.129222147895625
The correlation is:  0.11431257506790116
----------------------------------------
alpha: 2500000
Best RMSE is updated! 
The mean absolute error is:  6.526193621321449
The root mean squared error is:  8.128757006601058
The correlation is:  0.11431032383355486
----------------------------------------
alpha: 2510000
Best RMSE is updated! 
The mean absolute error is:  6.5257078802362924
The root mean squared error is:  8.128298342080171
The c

Best RMSE is updated! 
The mean absolute error is:  6.51032931629696
The root mean squared error is:  8.115344550507798
The correlation is:  0.11423800232044906
----------------------------------------
alpha: 2880000
Best RMSE is updated! 
The mean absolute error is:  6.5099534693174546
The root mean squared error is:  8.11506600579509
The correlation is:  0.11423630431690462
----------------------------------------
alpha: 2890000
Best RMSE is updated! 
The mean absolute error is:  6.50958005601408
The root mean squared error is:  8.114791061915074
The correlation is:  0.11423461800650413
----------------------------------------
alpha: 2900000
Best RMSE is updated! 
The mean absolute error is:  6.509209052836208
The root mean squared error is:  8.114519666640245
The correlation is:  0.11423294326887483
----------------------------------------
alpha: 2910000
Best RMSE is updated! 
The mean absolute error is:  6.508840436535949
The root mean squared error is:  8.11425176863697
The correl

Best RMSE is updated! 
The mean absolute error is:  6.496985823899671
The root mean squared error is:  8.106566745076961
The correlation is:  0.11417814753591249
----------------------------------------
alpha: 3280000
Best RMSE is updated! 
The mean absolute error is:  6.4966915566131345
The root mean squared error is:  8.10639893014637
The correlation is:  0.1141768374017439
----------------------------------------
alpha: 3290000
Best RMSE is updated! 
The mean absolute error is:  6.496398976526931
The root mean squared error is:  8.106233179259775
The correlation is:  0.11417553519749028
----------------------------------------
alpha: 3300000
Best RMSE is updated! 
The mean absolute error is:  6.496108069177088
The root mean squared error is:  8.106069465265229
The correlation is:  0.11417424085137177
----------------------------------------
alpha: 3310000
Best RMSE is updated! 
The mean absolute error is:  6.4958188202644145
The root mean squared error is:  8.1059077614277
The corre

Best RMSE is updated! 
The mean absolute error is:  6.487603615209769
The root mean squared error is:  8.10176325923576
The correlation is:  0.11413658270587485
----------------------------------------
alpha: 3630000
Best RMSE is updated! 
The mean absolute error is:  6.4873607616993345
The root mean squared error is:  8.101653913872115
The correlation is:  0.11413551245274797
----------------------------------------
alpha: 3640000
Best RMSE is updated! 
The mean absolute error is:  6.487119173743481
The root mean squared error is:  8.10154588856457
The correlation is:  0.11413444805713648
----------------------------------------
alpha: 3650000
Best RMSE is updated! 
The mean absolute error is:  6.486878841478147
The root mean squared error is:  8.101439167164
The correlation is:  0.11413338947108372
----------------------------------------
alpha: 3660000
Best RMSE is updated! 
The mean absolute error is:  6.486639755141474
The root mean squared error is:  8.101333733749334
The correla

Best RMSE is updated! 
The mean absolute error is:  6.47918858780529
The root mean squared error is:  8.098415038622102
The correlation is:  0.11409966163262546
----------------------------------------
alpha: 4010000
Best RMSE is updated! 
The mean absolute error is:  6.478987665082819
The root mean squared error is:  8.098346182537913
The correlation is:  0.11409878418523493
----------------------------------------
alpha: 4020000
Best RMSE is updated! 
The mean absolute error is:  6.478787695141265
The root mean squared error is:  8.098278166206073
The correlation is:  0.11409791108778354
----------------------------------------
alpha: 4030000
Best RMSE is updated! 
The mean absolute error is:  6.478588671221036
The root mean squared error is:  8.098210980017246
The correlation is:  0.11409704230800405
----------------------------------------
alpha: 4040000
Best RMSE is updated! 
The mean absolute error is:  6.4783905866263165
The root mean squared error is:  8.098144614487222
The cor

Best RMSE is updated! 
The mean absolute error is:  6.471833435668706
The root mean squared error is:  8.096231176007896
The correlation is:  0.11406766460962595
----------------------------------------
alpha: 4410000
Best RMSE is updated! 
The mean absolute error is:  6.471665919589961
The root mean squared error is:  8.096189498564087
The correlation is:  0.11406693881315748
----------------------------------------
alpha: 4420000
Best RMSE is updated! 
The mean absolute error is:  6.471499129104505
The root mean squared error is:  8.09614835813711
The correlation is:  0.1140662162902428
----------------------------------------
alpha: 4430000
Best RMSE is updated! 
The mean absolute error is:  6.471333059509066
The root mean squared error is:  8.096107748927547
The correlation is:  0.11406549701878448
----------------------------------------
alpha: 4440000
Best RMSE is updated! 
The mean absolute error is:  6.471167706140924
The root mean squared error is:  8.096067665206009
The corre

Best RMSE is updated! 
The mean absolute error is:  6.465655726255957
The root mean squared error is:  8.09493146956648
The correlation is:  0.11404098458063472
----------------------------------------
alpha: 4810000
Best RMSE is updated! 
The mean absolute error is:  6.465513929565648
The root mean squared error is:  8.094907361702928
The correlation is:  0.11404037426584156
----------------------------------------
alpha: 4820000
Best RMSE is updated! 
The mean absolute error is:  6.4653726981236534
The root mean squared error is:  8.094883605166999
The correlation is:  0.1140397664759799
----------------------------------------
alpha: 4830000
Best RMSE is updated! 
The mean absolute error is:  6.465232028557325
The root mean squared error is:  8.094860196334054
The correlation is:  0.11403916119541306
----------------------------------------
alpha: 4840000
Best RMSE is updated! 
The mean absolute error is:  6.465091917520789
The root mean squared error is:  8.09483713162039
The corre

Best RMSE is updated! 
The mean absolute error is:  6.460638221334049
The root mean squared error is:  8.094234553842728
The correlation is:  0.1140194446237199
----------------------------------------
alpha: 5190000
Best RMSE is updated! 
The mean absolute error is:  6.460515740868938
The root mean squared error is:  8.094221558333196
The correlation is:  0.1140189202606091
----------------------------------------
alpha: 5200000
Best RMSE is updated! 
The mean absolute error is:  6.460393714289002
The root mean squared error is:  8.094208801263669
The correlation is:  0.11401839790874407
----------------------------------------
alpha: 5210000
Best RMSE is updated! 
The mean absolute error is:  6.460272139076248
The root mean squared error is:  8.094196280251735
The correlation is:  0.11401787755657504
----------------------------------------
alpha: 5220000
Best RMSE is updated! 
The mean absolute error is:  6.4601510127312745
The root mean squared error is:  8.094183992940417
The corr

Best RMSE is updated! 
The mean absolute error is:  6.45656780731134
The root mean squared error is:  8.09390131229005
The correlation is:  0.11400175707332326
----------------------------------------
alpha: 5550000
Best RMSE is updated! 
The mean absolute error is:  6.456466067120857
The root mean squared error is:  8.093895564671728
The correlation is:  0.11400129842294125
----------------------------------------
alpha: 5560000
Best RMSE is updated! 
The mean absolute error is:  6.456364680330323
The root mean squared error is:  8.093889983782635
The correlation is:  0.11400084141814375
----------------------------------------
alpha: 5570000
Best RMSE is updated! 
The mean absolute error is:  6.456263645101826
The root mean squared error is:  8.093884567989162
The correlation is:  0.11400038605009079
----------------------------------------
alpha: 5580000
Best RMSE is updated! 
The mean absolute error is:  6.456162959610169
The root mean squared error is:  8.093879315674327
The corre

Best RMSE is updated! 
The mean absolute error is:  6.453387444213954
The root mean squared error is:  8.093791398923036
The correlation is:  0.11398744470519834
----------------------------------------
alpha: 5880000
Best RMSE is updated! 
The mean absolute error is:  6.453296460968868
The root mean squared error is:  8.09379037432941
The correlation is:  0.11398703601452051
----------------------------------------
alpha: 5890000
Best RMSE is updated! 
The mean absolute error is:  6.4532057766333
The root mean squared error is:  8.093789470392998
The correlation is:  0.11398662870822197
----------------------------------------
alpha: 5900000
Best RMSE is updated! 
The mean absolute error is:  6.453115389736802
The root mean squared error is:  8.093788685940783
The correlation is:  0.11398622277928012
----------------------------------------
alpha: 5910000
Best RMSE is updated! 
The mean absolute error is:  6.453025298818557
The root mean squared error is:  8.09378801981124
The correla

alpha: 9720000
alpha: 9730000
alpha: 9740000
alpha: 9750000
alpha: 9760000
alpha: 9770000
alpha: 9780000
alpha: 9790000
alpha: 9800000
alpha: 9810000
alpha: 9820000
alpha: 9830000
alpha: 9840000
alpha: 9850000
alpha: 9860000
alpha: 9870000
alpha: 9880000
alpha: 9890000
alpha: 9900000
alpha: 9910000
alpha: 9920000
alpha: 9930000
alpha: 9940000
alpha: 9950000
alpha: 9960000
alpha: 9970000
alpha: 9980000
alpha: 9990000


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 154 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 105 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

In [16]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [0,5960000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  22.738477373935684
The root mean squared error is:  45.95805279347314
The correlation is:  0.7576646272128614
----------------------------------------
alpha: 5960000
Best RMSE is updated! 
The mean absolute error is:  13.977523937370156
The root mean squared error is:  27.349926569941225
The correlation is:  0.7620075297251234
----------------------------------------
