## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful tp scale data only based on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2:Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the amount of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Hult-Winters).

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint
from sklearn.linear_model import Ridge
from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# initialize the plotting sizes
# set size
plt.rc('figure', figsize=(10, 5))
# subplots size
fizsize_with_subplots = (10, 10)
# histogram size
bin_size = 10

In [2]:
# Loading data
dates = pd.read_csv('data/dates.csv',header=None)
queries = pd.read_csv('data/queries.csv',header=None)
X = pd.read_csv('data/X.csv',header=None)
y = pd.read_csv('data/y.csv',header=None)

X.columns = queries

In [3]:
# data preprocessing and train-test split
# this data is well formed with no missing value and other symbols or labels that are non numerical.
# splitting the data into train and test

# Here we want the first 500 queries
X = X.iloc[:, 0: 500]

# first with the last year as test
# for convinience, pick the two validation set starting from 1/3 and 2/3 of the training set,
# 10% of training set as validation, here use 400 data points, two 200 periods.
test_size =365
length = X.shape[0]
l =(X.shape[0]-test_size)//3

train1_X = pd.concat([X[0:l],X[l+200:2*l],X[2*l+200:length-test_size]])
val1_X = pd.concat([X[l:l+200],X[2*l:2*l+200]])
train1_y = pd.concat([y[0:l],y[l+200:2*l],y[2*l+200:length-test_size]])
val1_y = pd.concat([y[l:l+200],y[2*l:2*l+200]])
test1_X = X[length-test_size:]
test1_y = y[length-test_size:]

# second with the last 2 year as test
test_size2 =365*2
l2 =(X.shape[0]-test_size2)//3

train2_X = pd.concat([X[0:l2],X[l2+180:2*l2],X[2*l2+180:length-test_size2]])
val2_X = pd.concat([X[l2:l2+180],X[2*l2:2*l2+180]])
train2_y = pd.concat([y[0:l2],y[l2+180:2*l2],y[2*l2+180:length-test_size2]])
val2_y = pd.concat([y[l2:l2+180],y[2*l2:2*l2+180]])
test2_X = X[length-test_size2:]
test2_y = y[length-test_size2:]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('validation1 X:',val1_X.shape,' validation1 y:',val1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)
print('validation2 X:',val2_X.shape,' validation2 y:',val2_y.shape)

X shape:  (4383, 500)   y shape:  (4383, 1)
train1 X: (3618, 500)  train1 y: (3618, 1)  Test1 X shape: (365, 500)  Test1 y shape: (365, 1)
validation1 X: (400, 500)  validation1 y: (400, 1)
train2 X: (3293, 500)  train2 y: (3293, 1)  Test2 X: (730, 500)  Test2 y: (730, 1)
validation2 X: (360, 500)  validation2 y: (360, 1)


In [4]:
train_size = train1_X.shape[0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs[i] = 0
    else:
        corrs[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs.shape,X.columns.shape,X.columns[0],X.columns[499]

((500, 1), (500,), ('flu',), ('symptoms of kidney infection',))

In [5]:
def corr_filter(df_X,df_Xval,df_Xtest,corrs,threshold):
    X = df_X.copy()
    X_test = df_Xtest.copy()
    X_val = df_Xval.copy()
    col_corr = set() # Set of all the names of deleted columns
    for i in range(0,500):
            if corrs[i,0] < threshold: 
                #print(X.columns[i])
                colname = df_X.columns[i]
                del X[colname] # deleting the column from the dataset
                del X_test[colname] 
                del X_val[colname]

    return X,X_val,X_test

## Fixing pearson correlation filter r>=0.2

In [6]:
#train1_X0,test1_X0 = corr_filter(train1_X,test1_X,corrs,0.1)
train1_X1,val1_X1,test1_X1 = corr_filter(train1_X,val1_X,test1_X,corrs,0.2)
#train1_X2,test1_X2 = corr_filter(train1_X,test1_X,corrs,0.3)
#train1_X3,test1_X3 = corr_filter(train1_X,test1_X,corrs,0.4)

In [7]:
#print('For correlation filter r>=0.1, we select feature number: ',train1_X0.shape[1])
print('For correlation filter r>=0.2, we select feature number: ',train1_X1.shape[1])
#print('For correlation filter r>=0.3, we select feature number: ',train1_X2.shape[1])
#print('For correlation filter r>=0.4, we select feature number: ',train1_X3.shape[1])

For correlation filter r>=0.2, we select feature number:  215


In [8]:
# Defien the mearure matrics, MAE, RMSE, CORR
# define three metrics: mean absolute error, root mean squared error and Pearson's correlation.
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.correcoef returns Pearson product-moment correlation coefficients
def pearson_r(x,y):   
    corr_mat = np.corrcoef(x,y)
    return corr_mat[0,1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convinient tuning
def ridge(a,train_X,train_y,test_X,test_y):
    # scaling and modeling
    scalerX = StandardScaler()
    scalerX.fit(train_X)
    train_X = scalerX.transform(train_X)
    test_X = scalerX.transform(test_X)
    
    scalery = StandardScaler() 
    scalery.fit(train_y) 
    train_y = scalery.transform(train_y) 
    
    
    alpha=a
    ri = Ridge(alpha, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
  
    ri.fit(train_X,train_y)

    y_pred1 = ri.predict(test_X)
    y_pred1 = scalery.inverse_transform(y_pred1)

    mae1 = mean_absolute_error(test_y, y_pred1)
    #print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(test_y, y_pred1))
    #print('The root mean squared error is: ',rmse1)
    
    corr_y = test_y.copy()
    corr_y['y_act'] = test_y
    corr_y['y_pred']= y_pred1
    corr = np.corrcoef(corr_y['y_act'],corr_y['y_pred'])[0,1]
    #print('The correlation is: ',corr)
    
    return rmse1, mae1,corr
    
    


# scaling and modeling
scaler = MinMaxScaler()
train1_X0_scaled = scaler.fit_transform(train1_X0)
test1_X0_scaled = scaler.transform(test1_x0)

train1_X1_scaled = scaler.fit_transform(train1_X1)
test1_X1_scaled = scaler.transform(test1_x1)

train1_X2_scaled = scaler.fit_transform(train1_X2)
test1_X2_scaled = scaler.transform(test1_x2)

train1_X3_scaled = scaler.fit_transform(train1_X3)
test1_X3_scaled = scaler.transform(test1_x3)


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.1, 310 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

In [16]:

para = np.arange(0, 10000000, 10000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')
        

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  23.84991227284693
The root mean squared error is:  49.25516226820323
The correlation is:  0.7491690821004
----------------------------------------
alpha: 10000
Best RMSE is updated! 
The mean absolute error is:  18.72112640895357
The root mean squared error is:  36.07439467584889
The correlation is:  0.8079109470589398
----------------------------------------
alpha: 20000
Best RMSE is updated! 
The mean absolute error is:  16.403153649263285
The root mean squared error is:  31.13621656074955
The correlation is:  0.8007177669253068
----------------------------------------
alpha: 30000
Best RMSE is updated! 
The mean absolute error is:  14.79540135344137
The root mean squared error is:  27.951852784150542
The correlation is:  0.7956867916535015
----------------------------------------
alpha: 40000
Best RMSE is updated! 
The mean absolute error is:  13.608370939392104
The root mean squared error is:  25.648474764894363
The corre

alpha: 1970000
alpha: 1980000
alpha: 1990000
alpha: 2000000
alpha: 2010000
alpha: 2020000
alpha: 2030000
alpha: 2040000
alpha: 2050000
alpha: 2060000
alpha: 2070000
alpha: 2080000
alpha: 2090000
alpha: 2100000
alpha: 2110000
alpha: 2120000
alpha: 2130000
alpha: 2140000
alpha: 2150000
alpha: 2160000
alpha: 2170000
alpha: 2180000
alpha: 2190000
alpha: 2200000
alpha: 2210000
alpha: 2220000
alpha: 2230000
alpha: 2240000
alpha: 2250000
alpha: 2260000
alpha: 2270000
alpha: 2280000
alpha: 2290000
alpha: 2300000
alpha: 2310000
alpha: 2320000
alpha: 2330000
alpha: 2340000
alpha: 2350000
alpha: 2360000
alpha: 2370000
alpha: 2380000
alpha: 2390000
alpha: 2400000
alpha: 2410000
alpha: 2420000
alpha: 2430000
alpha: 2440000
alpha: 2450000
alpha: 2460000
alpha: 2470000
alpha: 2480000
alpha: 2490000
alpha: 2500000
alpha: 2510000
alpha: 2520000
alpha: 2530000
alpha: 2540000
alpha: 2550000
alpha: 2560000
alpha: 2570000
alpha: 2580000
alpha: 2590000
alpha: 2600000
alpha: 2610000
alpha: 2620000
alpha: 263

alpha: 7450000
alpha: 7460000
alpha: 7470000
alpha: 7480000
alpha: 7490000
alpha: 7500000
alpha: 7510000
alpha: 7520000
alpha: 7530000
alpha: 7540000
alpha: 7550000
alpha: 7560000
alpha: 7570000
alpha: 7580000
alpha: 7590000
alpha: 7600000
alpha: 7610000
alpha: 7620000
alpha: 7630000
alpha: 7640000
alpha: 7650000
alpha: 7660000
alpha: 7670000
alpha: 7680000
alpha: 7690000
alpha: 7700000
alpha: 7710000
alpha: 7720000
alpha: 7730000
alpha: 7740000
alpha: 7750000
alpha: 7760000
alpha: 7770000
alpha: 7780000
alpha: 7790000
alpha: 7800000
alpha: 7810000
alpha: 7820000
alpha: 7830000
alpha: 7840000
alpha: 7850000
alpha: 7860000
alpha: 7870000
alpha: 7880000
alpha: 7890000
alpha: 7900000
alpha: 7910000
alpha: 7920000
alpha: 7930000
alpha: 7940000
alpha: 7950000
alpha: 7960000
alpha: 7970000
alpha: 7980000
alpha: 7990000
alpha: 8000000
alpha: 8010000
alpha: 8020000
alpha: 8030000
alpha: 8040000
alpha: 8050000
alpha: 8060000
alpha: 8070000
alpha: 8080000
alpha: 8090000
alpha: 8100000
alpha: 811

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 150 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 103 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

In [17]:
para =[0,210000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  23.84991227284693
The root mean squared error is:  49.25516226820323
The correlation is:  0.7491690821004
----------------------------------------
alpha: 210000
Best RMSE is updated! 
The mean absolute error is:  9.409335178983515
The root mean squared error is:  16.297997240292997
The correlation is:  0.7748210334059543
----------------------------------------


## For the last 2 years as the testing set

In [18]:
train_size = 3653
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs2[i] = 0
    else:
        corrs2[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs2.shape,X.columns.shape,X.columns[0],X.columns[499]

((500, 1), (500,), ('flu',), ('symptoms of kidney infection',))

In [19]:
#train2_X0,test2_X0 = corr_filter(train2_X,test2_X,corrs2,0.1)
train2_X1,val2_X1,test2_X1 = corr_filter(train2_X,val2_X,test2_X,corrs2,0.2)
#train2_X2,test2_X2 = corr_filter(train2_X,test2_X,corrs2,0.3)
#train2_X3,test2_X3 = corr_filter(train2_X,test2_X,corrs2,0.4)

In [20]:
#print('For correlation filter r>0.1, we select feature number: ',train2_X0.shape[1])
print('For correlation filter r>0.2, we select feature number: ',train2_X1.shape[1])
#print('For correlation filter r>0.3, we select feature number: ',train2_X2.shape[1])
#print('For correlation filter r>0.4, we select feature number: ',train2_X3.shape[1])

For correlation filter r>0.2, we select feature number:  215


# scaling and modeling
scaler = MinMaxScaler()
train2_X0_scaled = scaler.fit_transform(train2_X0)
test2_X0_scaled = scaler.transform(test2_X0)

train2_X1_scaled = scaler.fit_transform(train2_X1)
test2_X1_scaled = scaler.transform(test2_X1)

train2_X2_scaled = scaler.fit_transform(train2_X2)
test2_X2_scaled = scaler.transform(test2_X2)

train2_X3_scaled = scaler.fit_transform(train2_X3)
test2_X3_scaled = scaler.transform(test2_X3)


para = range(1,100)
para = np.arange(0.01, 10.0, 0.01)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

In [21]:

para =np.arange(0, 10000000, 10000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train2_X1,train2_y,val2_X1,val2_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  5.701643382564393
The root mean squared error is:  14.41849934736827
The correlation is:  0.2993203937806601
----------------------------------------
alpha: 10000
alpha: 20000
alpha: 30000
alpha: 40000
alpha: 50000
alpha: 60000
alpha: 70000
alpha: 80000
alpha: 90000
alpha: 100000
alpha: 110000
alpha: 120000
alpha: 130000
Best RMSE is updated! 
The mean absolute error is:  7.625116530231743
The root mean squared error is:  14.243350836654377
The correlation is:  0.1212054653660179
----------------------------------------
alpha: 140000
Best RMSE is updated! 
The mean absolute error is:  7.593353912384872
The root mean squared error is:  13.9927409462112
The correlation is:  0.12038414279285284
----------------------------------------
alpha: 150000
Best RMSE is updated! 
The mean absolute error is:  7.563390233066975
The root mean squared error is:  13.755110951543923
The correlation is:  0.11966121056570772
--------------------

Best RMSE is updated! 
The mean absolute error is:  7.072034206785298
The root mean squared error is:  10.197952987589943
The correlation is:  0.11244443486674398
----------------------------------------
alpha: 450000
Best RMSE is updated! 
The mean absolute error is:  7.061994388625529
The root mean squared error is:  10.138898808073161
The correlation is:  0.11235511366439045
----------------------------------------
alpha: 460000
Best RMSE is updated! 
The mean absolute error is:  7.052215118358524
The root mean squared error is:  10.082087974693374
The correlation is:  0.11226952640971657
----------------------------------------
alpha: 470000
Best RMSE is updated! 
The mean absolute error is:  7.04268702993729
The root mean squared error is:  10.027412899626453
The correlation is:  0.11218744364063205
----------------------------------------
alpha: 480000
Best RMSE is updated! 
The mean absolute error is:  7.033538563751634
The root mean squared error is:  9.974772191139161
The corr

Best RMSE is updated! 
The mean absolute error is:  6.8140664549850944
The root mean squared error is:  8.885130740465586
The correlation is:  0.11046884662954613
----------------------------------------
alpha: 860000
Best RMSE is updated! 
The mean absolute error is:  6.810176446419958
The root mean squared error is:  8.869388079625232
The correlation is:  0.11044369390025022
----------------------------------------
alpha: 870000
Best RMSE is updated! 
The mean absolute error is:  6.8063541565991725
The root mean squared error is:  8.854063231549668
The correlation is:  0.11041910716609002
----------------------------------------
alpha: 880000
Best RMSE is updated! 
The mean absolute error is:  6.80259785302942
The root mean squared error is:  8.839142143477076
The correlation is:  0.1103950675323745
----------------------------------------
alpha: 890000
Best RMSE is updated! 
The mean absolute error is:  6.798905860895179
The root mean squared error is:  8.824611334405022
The correla

Best RMSE is updated! 
The mean absolute error is:  6.700844913803068
The root mean squared error is:  8.487435332898315
The correlation is:  0.10978400724332715
----------------------------------------
alpha: 1250000
Best RMSE is updated! 
The mean absolute error is:  6.69873089720984
The root mean squared error is:  8.481190781334199
The correlation is:  0.10977198283517837
----------------------------------------
alpha: 1260000
Best RMSE is updated! 
The mean absolute error is:  6.696644503026217
The root mean squared error is:  8.475075345389698
The correlation is:  0.10976014646578833
----------------------------------------
alpha: 1270000
Best RMSE is updated! 
The mean absolute error is:  6.69458519597991
The root mean squared error is:  8.469085652942177
The correlation is:  0.10974849375819482
----------------------------------------
alpha: 1280000
Best RMSE is updated! 
The mean absolute error is:  6.692552454459407
The root mean squared error is:  8.463218438684795
The corre

Best RMSE is updated! 
The mean absolute error is:  6.6335575793082295
The root mean squared error is:  8.312721211042323
The correlation is:  0.10941610280735288
----------------------------------------
alpha: 1650000
Best RMSE is updated! 
The mean absolute error is:  6.6322350273306885
The root mean squared error is:  8.309790084042845
The correlation is:  0.10940916467890373
----------------------------------------
alpha: 1660000
Best RMSE is updated! 
The mean absolute error is:  6.630926206611934
The root mean squared error is:  8.306908676631494
The correlation is:  0.10940230919638774
----------------------------------------
alpha: 1670000
Best RMSE is updated! 
The mean absolute error is:  6.6296309049518465
The root mean squared error is:  8.304075932292509
The correlation is:  0.10939553489178369
----------------------------------------
alpha: 1680000
Best RMSE is updated! 
The mean absolute error is:  6.628348914487525
The root mean squared error is:  8.301290821734693
The 

Best RMSE is updated! 
The mean absolute error is:  6.597324640977748
The root mean squared error is:  8.239181303681267
The correlation is:  0.10922886077405283
----------------------------------------
alpha: 1970000
Best RMSE is updated! 
The mean absolute error is:  6.596402961492602
The root mean squared error is:  8.237427445323847
The correlation is:  0.10922398025479543
----------------------------------------
alpha: 1980000
Best RMSE is updated! 
The mean absolute error is:  6.595489491955864
The root mean squared error is:  8.23569975298561
The correlation is:  0.10921914856423125
----------------------------------------
alpha: 1990000
Best RMSE is updated! 
The mean absolute error is:  6.5945841233502325
The root mean squared error is:  8.233997742157854
The correlation is:  0.10921436497320174
----------------------------------------
alpha: 2000000
Best RMSE is updated! 
The mean absolute error is:  6.593686748576222
The root mean squared error is:  8.232320939193695
The cor

Best RMSE is updated! 
The mean absolute error is:  6.571542912832708
The root mean squared error is:  8.193972904827085
The correlation is:  0.10909374278653178
----------------------------------------
alpha: 2290000
Best RMSE is updated! 
The mean absolute error is:  6.57084965962321
The root mean squared error is:  8.192861130236778
The correlation is:  0.10909012380167873
----------------------------------------
alpha: 2300000
Best RMSE is updated! 
The mean absolute error is:  6.570161813207867
The root mean squared error is:  8.191764245401092
The correlation is:  0.1090865360275611
----------------------------------------
alpha: 2310000
Best RMSE is updated! 
The mean absolute error is:  6.569479310660914
The root mean squared error is:  8.19068200561153
The correlation is:  0.109082979062153
----------------------------------------
alpha: 2320000
Best RMSE is updated! 
The mean absolute error is:  6.568802090027889
The root mean squared error is:  8.189614171016242
The correlat

Best RMSE is updated! 
The mean absolute error is:  6.548007789308981
The root mean squared error is:  8.159753493954453
The correlation is:  0.10897254731032477
----------------------------------------
alpha: 2680000
Best RMSE is updated! 
The mean absolute error is:  6.547486330085524
The root mean squared error is:  8.15907764513247
The correlation is:  0.10896990027265727
----------------------------------------
alpha: 2690000
Best RMSE is updated! 
The mean absolute error is:  6.546968402482078
The root mean squared error is:  8.158409911080184
The correlation is:  0.10896727277689816
----------------------------------------
alpha: 2700000
Best RMSE is updated! 
The mean absolute error is:  6.54645397077519
The root mean squared error is:  8.157750174200412
The correlation is:  0.10896466460743601
----------------------------------------
alpha: 2710000
Best RMSE is updated! 
The mean absolute error is:  6.545942999721117
The root mean squared error is:  8.157098318945483
The corre

Best RMSE is updated! 
The mean absolute error is:  6.529996644777394
The root mean squared error is:  8.13848099152406
The correlation is:  0.10888205330977582
----------------------------------------
alpha: 3070000
Best RMSE is updated! 
The mean absolute error is:  6.529590253719086
The root mean squared error is:  8.138050252117115
The correlation is:  0.10888003340840897
----------------------------------------
alpha: 3080000
Best RMSE is updated! 
The mean absolute error is:  6.52918629457351
The root mean squared error is:  8.137624244732084
The correlation is:  0.10887802654242248
----------------------------------------
alpha: 3090000
Best RMSE is updated! 
The mean absolute error is:  6.528784745591161
The root mean squared error is:  8.137202907854737
The correlation is:  0.1088760325860351
----------------------------------------
alpha: 3100000
Best RMSE is updated! 
The mean absolute error is:  6.528385585280923
The root mean squared error is:  8.136786180927674
The correl

Best RMSE is updated! 
The mean absolute error is:  6.513527020212313
The root mean squared error is:  8.122766066198087
The correlation is:  0.10880095210040372
----------------------------------------
alpha: 3530000
Best RMSE is updated! 
The mean absolute error is:  6.513213366152538
The root mean squared error is:  8.122501440025655
The correlation is:  0.10879942250923225
----------------------------------------
alpha: 3540000
Best RMSE is updated! 
The mean absolute error is:  6.512901362325295
The root mean squared error is:  8.122239491247019
The correlation is:  0.10879790151338871
----------------------------------------
alpha: 3550000
Best RMSE is updated! 
The mean absolute error is:  6.512590995747739
The root mean squared error is:  8.121980188698123
The correlation is:  0.10879638904062482
----------------------------------------
alpha: 3560000
Best RMSE is updated! 
The mean absolute error is:  6.512282253572777
The root mean squared error is:  8.121723501645864
The cor

Best RMSE is updated! 
The mean absolute error is:  6.501894533459392
The root mean squared error is:  8.113818592093374
The correlation is:  0.10874459097568137
----------------------------------------
alpha: 3940000
Best RMSE is updated! 
The mean absolute error is:  6.501639179661793
The root mean squared error is:  8.113642158603444
The correlation is:  0.10874336214359386
----------------------------------------
alpha: 3950000
Best RMSE is updated! 
The mean absolute error is:  6.501385038723886
The root mean squared error is:  8.113467415176089
The correlation is:  0.10874213950344752
----------------------------------------
alpha: 3960000
Best RMSE is updated! 
The mean absolute error is:  6.501132102028476
The root mean squared error is:  8.113294343746055
The correlation is:  0.10874092300855893
----------------------------------------
alpha: 3970000
Best RMSE is updated! 
The mean absolute error is:  6.500880361039771
The root mean squared error is:  8.11312292647582
The corr

Best RMSE is updated! 
The mean absolute error is:  6.49340882991559
The root mean squared error is:  8.108414993248568
The correlation is:  0.10870394615004818
----------------------------------------
alpha: 4300000
Best RMSE is updated! 
The mean absolute error is:  6.4931922396904005
The root mean squared error is:  8.10828946895625
The correlation is:  0.10870291384511685
----------------------------------------
alpha: 4310000
Best RMSE is updated! 
The mean absolute error is:  6.492976597266073
The root mean squared error is:  8.108165106893589
The correlation is:  0.10870188630928317
----------------------------------------
alpha: 4320000
Best RMSE is updated! 
The mean absolute error is:  6.492761896436883
The root mean squared error is:  8.108041895431086
The correlation is:  0.10870086350957395
----------------------------------------
alpha: 4330000
Best RMSE is updated! 
The mean absolute error is:  6.492548131051138
The root mean squared error is:  8.107919823075475
The corr

Best RMSE is updated! 
The mean absolute error is:  6.485062441835881
The root mean squared error is:  8.104023953954755
The correlation is:  0.10866434822404039
----------------------------------------
alpha: 4720000
Best RMSE is updated! 
The mean absolute error is:  6.484880876742664
The root mean squared error is:  8.103938608603091
The correlation is:  0.10866349096064896
----------------------------------------
alpha: 4730000
Best RMSE is updated! 
The mean absolute error is:  6.484700039366666
The root mean squared error is:  8.103854035875674
The correlation is:  0.10866263730743381
----------------------------------------
alpha: 4740000
Best RMSE is updated! 
The mean absolute error is:  6.484519925342765
The root mean squared error is:  8.103770228548996
The correlation is:  0.10866178724163786
----------------------------------------
alpha: 4750000
Best RMSE is updated! 
The mean absolute error is:  6.48434053034068
The root mean squared error is:  8.10368717947785
The corre

Best RMSE is updated! 
The mean absolute error is:  6.47864013721227
The root mean squared error is:  8.10126836995122
The correlation is:  0.10863413133940374
----------------------------------------
alpha: 5100000
Best RMSE is updated! 
The mean absolute error is:  6.478483426876063
The root mean squared error is:  8.101207900598164
The correlation is:  0.10863339673868501
----------------------------------------
alpha: 5110000
Best RMSE is updated! 
The mean absolute error is:  6.4783273002236
The root mean squared error is:  8.10114797705763
The correlation is:  0.10863266500237816
----------------------------------------
alpha: 5120000
Best RMSE is updated! 
The mean absolute error is:  6.478171754000717
The root mean squared error is:  8.101088594493906
The correlation is:  0.108631936113762
----------------------------------------
alpha: 5130000
Best RMSE is updated! 
The mean absolute error is:  6.478016784977384
The root mean squared error is:  8.101029748120453
The correlatio

Best RMSE is updated! 
The mean absolute error is:  6.473069698159886
The root mean squared error is:  8.099316806809998
The correlation is:  0.1086080981280559
----------------------------------------
alpha: 5480000
Best RMSE is updated! 
The mean absolute error is:  6.472933068440433
The root mean squared error is:  8.099274054640116
The correlation is:  0.10860746162969474
----------------------------------------
alpha: 5490000
Best RMSE is updated! 
The mean absolute error is:  6.472796914000072
The root mean squared error is:  8.099231694742244
The correlation is:  0.10860682744202456
----------------------------------------
alpha: 5500000
Best RMSE is updated! 
The mean absolute error is:  6.47266123236365
The root mean squared error is:  8.099189723800873
The correlation is:  0.10860619555248502
----------------------------------------
alpha: 5510000
Best RMSE is updated! 
The mean absolute error is:  6.4725260210731665
The root mean squared error is:  8.099148138532307
The corr

Best RMSE is updated! 
The mean absolute error is:  6.467241681462727
The root mean squared error is:  8.09771062674467
The correlation is:  0.10858103388756904
----------------------------------------
alpha: 5940000
Best RMSE is updated! 
The mean absolute error is:  6.467124590930823
The root mean squared error is:  8.097682918603587
The correlation is:  0.10858049193798923
----------------------------------------
alpha: 5950000
Best RMSE is updated! 
The mean absolute error is:  6.467007877550905
The root mean squared error is:  8.097655478455497
The correlation is:  0.10857995180423968
----------------------------------------
alpha: 5960000
Best RMSE is updated! 
The mean absolute error is:  6.466891539503982
The root mean squared error is:  8.097628304142644
The correlation is:  0.10857941347720902
----------------------------------------
alpha: 5970000
Best RMSE is updated! 
The mean absolute error is:  6.466775574982746
The root mean squared error is:  8.097601393526697
The corr

Best RMSE is updated! 
The mean absolute error is:  6.462321808890645
The root mean squared error is:  8.096701121225408
The correlation is:  0.1085583230310341
----------------------------------------
alpha: 6390000
Best RMSE is updated! 
The mean absolute error is:  6.4622200421732385
The root mean squared error is:  8.096683584932112
The correlation is:  0.10855785456772181
----------------------------------------
alpha: 6400000
Best RMSE is updated! 
The mean absolute error is:  6.46211858110828
The root mean squared error is:  8.096666236230847
The correlation is:  0.10855738756398545
----------------------------------------
alpha: 6410000
Best RMSE is updated! 
The mean absolute error is:  6.462017424321005
The root mean squared error is:  8.09664907366868
The correlation is:  0.10855692201301394
----------------------------------------
alpha: 6420000
Best RMSE is updated! 
The mean absolute error is:  6.4619165704448935
The root mean squared error is:  8.096632095805028
The corr

Best RMSE is updated! 
The mean absolute error is:  6.458205838068299
The root mean squared error is:  8.096099942625417
The correlation is:  0.10853941797728393
----------------------------------------
alpha: 6820000
Best RMSE is updated! 
The mean absolute error is:  6.458116067484492
The root mean squared error is:  8.096089299669902
The correlation is:  0.10853900661002362
----------------------------------------
alpha: 6830000
Best RMSE is updated! 
The mean absolute error is:  6.4580265501748855
The root mean squared error is:  8.096078791658126
The correlation is:  0.10853859644398012
----------------------------------------
alpha: 6840000
Best RMSE is updated! 
The mean absolute error is:  6.457937285069262
The root mean squared error is:  8.096068417574795
The correlation is:  0.10853818747390002
----------------------------------------
alpha: 6850000
Best RMSE is updated! 
The mean absolute error is:  6.4578482711034235
The root mean squared error is:  8.096058176412846
The c

Best RMSE is updated! 
The mean absolute error is:  6.4544025649910015
The root mean squared error is:  8.095741335871567
The correlation is:  0.10852202540577888
----------------------------------------
alpha: 7270000
Best RMSE is updated! 
The mean absolute error is:  6.454323211770752
The root mean squared error is:  8.095735866571621
The correlation is:  0.10852166329524669
----------------------------------------
alpha: 7280000
Best RMSE is updated! 
The mean absolute error is:  6.454244069074348
The root mean squared error is:  8.095730493703778
The correlation is:  0.1085213021769099
----------------------------------------
alpha: 7290000
Best RMSE is updated! 
The mean absolute error is:  6.454165136065217
The root mean squared error is:  8.095725216557986
The correlation is:  0.10852094204669614
----------------------------------------
alpha: 7300000
Best RMSE is updated! 
The mean absolute error is:  6.454090673569089
The root mean squared error is:  8.095720034429682
The cor

Best RMSE is updated! 
The mean absolute error is:  6.452090619009234
The root mean squared error is:  8.09561081499652
The correlation is:  0.10851091056935684
----------------------------------------
alpha: 7590000
Best RMSE is updated! 
The mean absolute error is:  6.45202182739956
The root mean squared error is:  8.095608104947464
The correlation is:  0.10851057829433323
----------------------------------------
alpha: 7600000
Best RMSE is updated! 
The mean absolute error is:  6.45195321083792
The root mean squared error is:  8.09560547111153
The correlation is:  0.10851024689151662
----------------------------------------
alpha: 7610000
Best RMSE is updated! 
The mean absolute error is:  6.4518847686570915
The root mean squared error is:  8.095602912932852
The correlation is:  0.10850991635747677
----------------------------------------
alpha: 7620000
Best RMSE is updated! 
The mean absolute error is:  6.451816500193236
The root mean squared error is:  8.09560042985974
The correla

Best RMSE is updated! 
The mean absolute error is:  6.449406917350668
The root mean squared error is:  8.09555666994409
The correlation is:  0.1084979676763432
----------------------------------------
alpha: 8000000
alpha: 8010000
alpha: 8020000
alpha: 8030000
alpha: 8040000
alpha: 8050000
alpha: 8060000
alpha: 8070000
alpha: 8080000
alpha: 8090000
alpha: 8100000
alpha: 8110000
alpha: 8120000
alpha: 8130000
alpha: 8140000
alpha: 8150000
alpha: 8160000
alpha: 8170000
alpha: 8180000
alpha: 8190000
alpha: 8200000
alpha: 8210000
alpha: 8220000
alpha: 8230000
alpha: 8240000
alpha: 8250000
alpha: 8260000
alpha: 8270000
alpha: 8280000
alpha: 8290000
alpha: 8300000
alpha: 8310000
alpha: 8320000
alpha: 8330000
alpha: 8340000
alpha: 8350000
alpha: 8360000
alpha: 8370000
alpha: 8380000
alpha: 8390000
alpha: 8400000
alpha: 8410000
alpha: 8420000
alpha: 8430000
alpha: 8440000
alpha: 8450000
alpha: 8460000
alpha: 8470000
alpha: 8480000
alpha: 8490000
alpha: 8500000
alpha: 8510000
alpha: 8520000
alph

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 154 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 105 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

In [22]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [0,7990000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  23.84991227284693
The root mean squared error is:  49.25516226820323
The correlation is:  0.7491690821004
----------------------------------------
alpha: 7990000
Best RMSE is updated! 
The mean absolute error is:  13.983334575979702
The root mean squared error is:  27.37858452479817
The correlation is:  0.7673283115474481
----------------------------------------
