## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful tp scale data only based on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2:Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the amount of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Hult-Winters).

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint
from sklearn.linear_model import Ridge
from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# initialize the plotting sizes
# set size
plt.rc('figure', figsize=(10, 5))
# subplots size
fizsize_with_subplots = (10, 10)
# histogram size
bin_size = 10

In [2]:
# Loading data
dates = pd.read_csv('data/dates.csv',header=None)
queries = pd.read_csv('data/queries.csv',header=None)
X = pd.read_csv('data/X.csv',header=None)
y = pd.read_csv('data/y.csv',header=None)

X.columns = queries

In [3]:
# data preprocessing and train-test split
# this data is well formed with no missing value and other symbols or labels that are non numerical.
# splitting the data into train and test

# Here we want the first 500 queries
X = X.iloc[:, 0: 250]

# first with the last year as test
# for convinience, pick the two validation set starting from 1/3 and 2/3 of the training set,
# 10% of training set as validation, here use 400 data points, two 200 periods.
test_size =365
length = X.shape[0]
l =(X.shape[0]-test_size)//3

train1_X = pd.concat([X[0:l],X[l+200:2*l],X[2*l+200:length-test_size]])
val1_X = pd.concat([X[l:l+200],X[2*l:2*l+200]])
train1_y = pd.concat([y[0:l],y[l+200:2*l],y[2*l+200:length-test_size]])
val1_y = pd.concat([y[l:l+200],y[2*l:2*l+200]])
test1_X = X[length-test_size:]
test1_y = y[length-test_size:]

# second with the last 2 year as test
test_size2 =365*2
l2 =(X.shape[0]-test_size2)//3

train2_X = pd.concat([X[0:l2],X[l2+180:2*l2],X[2*l2+180:length-test_size2]])
val2_X = pd.concat([X[l2:l2+180],X[2*l2:2*l2+180]])
train2_y = pd.concat([y[0:l2],y[l2+180:2*l2],y[2*l2+180:length-test_size2]])
val2_y = pd.concat([y[l2:l2+180],y[2*l2:2*l2+180]])
test2_X = X[length-test_size2:length-test_size]
test2_y = y[length-test_size2:length-test_size]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('validation1 X:',val1_X.shape,' validation1 y:',val1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)
print('validation2 X:',val2_X.shape,' validation2 y:',val2_y.shape)

X shape:  (4383, 250)   y shape:  (4383, 1)
train1 X: (3618, 250)  train1 y: (3618, 1)  Test1 X shape: (365, 250)  Test1 y shape: (365, 1)
validation1 X: (400, 250)  validation1 y: (400, 1)
train2 X: (3293, 250)  train2 y: (3293, 1)  Test2 X: (730, 250)  Test2 y: (730, 1)
validation2 X: (360, 250)  validation2 y: (360, 1)


In [4]:
train_size = train1_X.shape[0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs[i] = 0
    else:
        corrs[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [5]:
def corr_filter(df_X,df_Xval,df_Xtest,corrs,threshold):
    X = df_X.copy()
    X_test = df_Xtest.copy()
    X_val = df_Xval.copy()
    col_corr = set() # Set of all the names of deleted columns
    for i in range(0,250):
            if corrs[i,0] < threshold: 
                #print(X.columns[i])
                colname = df_X.columns[i]
                del X[colname] # deleting the column from the dataset
                del X_test[colname] 
                del X_val[colname]

    return X,X_val,X_test

## Fixing pearson correlation filter r>=0.2

In [6]:
#train1_X0,test1_X0 = corr_filter(train1_X,test1_X,corrs,0.1)
train1_X1,val1_X1,test1_X1 = corr_filter(train1_X,val1_X,test1_X,corrs,0.2)
#train1_X2,test1_X2 = corr_filter(train1_X,test1_X,corrs,0.3)
#train1_X3,test1_X3 = corr_filter(train1_X,test1_X,corrs,0.4)

In [7]:
#print('For correlation filter r>=0.1, we select feature number: ',train1_X0.shape[1])
print('For correlation filter r>=0.2, we select feature number: ',train1_X1.shape[1])
#print('For correlation filter r>=0.3, we select feature number: ',train1_X2.shape[1])
#print('For correlation filter r>=0.4, we select feature number: ',train1_X3.shape[1])

For correlation filter r>=0.2, we select feature number:  157


In [8]:
# Defien the mearure matrics, MAE, RMSE, CORR
# define three metrics: mean absolute error, root mean squared error and Pearson's correlation.
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.correcoef returns Pearson product-moment correlation coefficients
def pearson_r(x,y):   
    corr_mat = np.corrcoef(x,y)
    return corr_mat[0,1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convinient tuning
def ridge(a,train_X,train_y,test_X,test_y):
    alpha=a
    ri = Ridge(alpha, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
  
    ri.fit(train_X,train_y)

    y_pred1 = ri.predict(test_X)

    mae1 = mean_absolute_error(test_y, y_pred1)
    #print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(test_y, y_pred1))
    #print('The root mean squared error is: ',rmse1)
    
    corr_y = test_y.copy()
    corr_y['y_act'] = test_y
    corr_y['y_pred']= y_pred1
    corr = np.corrcoef(corr_y['y_act'],corr_y['y_pred'])[0,1]
    #print('The correlation is: ',corr)
    
    return rmse1, mae1,corr
    
    


# scaling and modeling
scaler = MinMaxScaler()
train1_X0_scaled = scaler.fit_transform(train1_X0)
test1_X0_scaled = scaler.transform(test1_x0)

train1_X1_scaled = scaler.fit_transform(train1_X1)
test1_X1_scaled = scaler.transform(test1_x1)

train1_X2_scaled = scaler.fit_transform(train1_X2)
test1_X2_scaled = scaler.transform(test1_x2)

train1_X3_scaled = scaler.fit_transform(train1_X3)
test1_X3_scaled = scaler.transform(test1_x3)


para = [10]
para_l=[0.3]
print('For correlation filter r>=0.1, 310 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

In [9]:

para = np.arange(0.01, 100000000, 100000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')
        

alpha: 0.01
Best RMSE is updated! 
The mean absolute error is:  22.738477450973598
The root mean squared error is:  45.958052947675554
The correlation is:  0.7576646305034048
----------------------------------------
alpha: 100000.01
Best RMSE is updated! 
The mean absolute error is:  22.835767393873052
The root mean squared error is:  45.7247377041174
The correlation is:  0.7828578465778442
----------------------------------------
alpha: 200000.01
Best RMSE is updated! 
The mean absolute error is:  22.740940801233837
The root mean squared error is:  45.112747716741495
The correlation is:  0.7995459431814701
----------------------------------------
alpha: 300000.01
Best RMSE is updated! 
The mean absolute error is:  22.660687654051323
The root mean squared error is:  44.62687212114223
The correlation is:  0.8116760346728115
----------------------------------------
alpha: 400000.01
Best RMSE is updated! 
The mean absolute error is:  22.596325688054016
The root mean squared error is:  44.

Best RMSE is updated! 
The mean absolute error is:  21.579649949243393
The root mean squared error is:  40.10752769271444
The correlation is:  0.8851124628468852
----------------------------------------
alpha: 4500000.01
Best RMSE is updated! 
The mean absolute error is:  21.561791683253094
The root mean squared error is:  40.05297862823588
The correlation is:  0.8854444187297867
----------------------------------------
alpha: 4600000.01
Best RMSE is updated! 
The mean absolute error is:  21.544188018855298
The root mean squared error is:  39.99937444685101
The correlation is:  0.8857631595438921
----------------------------------------
alpha: 4700000.01
Best RMSE is updated! 
The mean absolute error is:  21.526837039994813
The root mean squared error is:  39.946692868807354
The correlation is:  0.8860693560632824
----------------------------------------
alpha: 4800000.01
Best RMSE is updated! 
The mean absolute error is:  21.50973671255723
The root mean squared error is:  39.894912549

Best RMSE is updated! 
The mean absolute error is:  21.122862619530803
The root mean squared error is:  38.74375236382562
The correlation is:  0.8913868129155494
----------------------------------------
alpha: 7700000.01
Best RMSE is updated! 
The mean absolute error is:  21.11195507591755
The root mean squared error is:  38.71139300684637
The correlation is:  0.8914889636959256
----------------------------------------
alpha: 7800000.01
Best RMSE is updated! 
The mean absolute error is:  21.10121970574625
The root mean squared error is:  38.679530203743866
The correlation is:  0.8915874937372875
----------------------------------------
alpha: 7900000.01
Best RMSE is updated! 
The mean absolute error is:  21.09065403817163
The root mean squared error is:  38.648155164177346
The correlation is:  0.8916825167563545
----------------------------------------
alpha: 8000000.01
Best RMSE is updated! 
The mean absolute error is:  21.080255630768523
The root mean squared error is:  38.6172592959

Best RMSE is updated! 
The mean absolute error is:  20.82689789182773
The root mean squared error is:  37.83875970009754
The correlation is:  0.8933858868126366
----------------------------------------
alpha: 11300000.01
Best RMSE is updated! 
The mean absolute error is:  20.82105309915387
The root mean squared error is:  37.81988356968346
The correlation is:  0.8934058205073047
----------------------------------------
alpha: 11400000.01
Best RMSE is updated! 
The mean absolute error is:  20.81530908102272
The root mean squared error is:  37.80128148953891
The correlation is:  0.8934243916752544
----------------------------------------
alpha: 11500000.01
Best RMSE is updated! 
The mean absolute error is:  20.80966440130572
The root mean squared error is:  37.78294931387453
The correlation is:  0.8934416317017353
----------------------------------------
alpha: 11600000.01
Best RMSE is updated! 
The mean absolute error is:  20.80411764693983
The root mean squared error is:  37.7648829759

Best RMSE is updated! 
The mean absolute error is:  20.667972157256678
The root mean squared error is:  37.295177343465774
The correlation is:  0.8934106741453943
----------------------------------------
alpha: 15000000.01
Best RMSE is updated! 
The mean absolute error is:  20.665056509950418
The root mean squared error is:  37.284192590590855
The correlation is:  0.8933956369714423
----------------------------------------
alpha: 15100000.01
Best RMSE is updated! 
The mean absolute error is:  20.662201295892864
The root mean squared error is:  37.273369707593346
The correlation is:  0.8933799727637807
----------------------------------------
alpha: 15200000.01
Best RMSE is updated! 
The mean absolute error is:  20.659405720523946
The root mean squared error is:  37.262706562689786
The correlation is:  0.8933636933125534
----------------------------------------
alpha: 15300000.01
Best RMSE is updated! 
The mean absolute error is:  20.656669001522346
The root mean squared error is:  37.2

Best RMSE is updated! 
The mean absolute error is:  20.594891205272525
The root mean squared error is:  36.98204230500472
The correlation is:  0.8925131951243126
----------------------------------------
alpha: 18700000.01
Best RMSE is updated! 
The mean absolute error is:  20.593764891073633
The root mean squared error is:  36.97586102277867
The correlation is:  0.892481113759578
----------------------------------------
alpha: 18800000.01
Best RMSE is updated! 
The mean absolute error is:  20.59267632649468
The root mean squared error is:  36.969781683917596
The correlation is:  0.892448706953324
----------------------------------------
alpha: 18900000.01
Best RMSE is updated! 
The mean absolute error is:  20.591625053320676
The root mean squared error is:  36.96380309575602
The correlation is:  0.8924159800781711
----------------------------------------
alpha: 19000000.01
Best RMSE is updated! 
The mean absolute error is:  20.590610619788446
The root mean squared error is:  36.9579240

Best RMSE is updated! 
The mean absolute error is:  20.575415617526968
The root mean squared error is:  36.80990213749356
The correlation is:  0.8911041155221023
----------------------------------------
alpha: 22500000.01
Best RMSE is updated! 
The mean absolute error is:  20.575472346114132
The root mean squared error is:  36.80688289393203
The correlation is:  0.8910627281086161
----------------------------------------
alpha: 22600000.01
Best RMSE is updated! 
The mean absolute error is:  20.575553114655467
The root mean squared error is:  36.80393028800561
The correlation is:  0.8910211635704874
----------------------------------------
alpha: 22700000.01
Best RMSE is updated! 
The mean absolute error is:  20.575657649458662
The root mean squared error is:  36.801043614663065
The correlation is:  0.890979424637681
----------------------------------------
alpha: 22800000.01
Best RMSE is updated! 
The mean absolute error is:  20.57578568033175
The root mean squared error is:  36.798222

Best RMSE is updated! 
The mean absolute error is:  20.593858936579558
The root mean squared error is:  36.73763663293486
The correlation is:  0.8894734350636392
----------------------------------------
alpha: 26200000.01
Best RMSE is updated! 
The mean absolute error is:  20.594772940821873
The root mean squared error is:  36.736675674983736
The correlation is:  0.8894270174114
----------------------------------------
alpha: 26300000.01
Best RMSE is updated! 
The mean absolute error is:  20.595702594223834
The root mean squared error is:  36.735760364766946
The correlation is:  0.8893804997629042
----------------------------------------
alpha: 26400000.01
Best RMSE is updated! 
The mean absolute error is:  20.596647724160686
The root mean squared error is:  36.73489025555788
The correlation is:  0.8893338836630356
----------------------------------------
alpha: 26500000.01
Best RMSE is updated! 
The mean absolute error is:  20.597608160034643
The root mean squared error is:  36.734064

alpha: 37800000.01
alpha: 37900000.01
alpha: 38000000.01
alpha: 38100000.01
alpha: 38200000.01
alpha: 38300000.01
alpha: 38400000.01
alpha: 38500000.01
alpha: 38600000.01
alpha: 38700000.01
alpha: 38800000.01
alpha: 38900000.01
alpha: 39000000.01
alpha: 39100000.01
alpha: 39200000.01
alpha: 39300000.01
alpha: 39400000.01
alpha: 39500000.01
alpha: 39600000.01
alpha: 39700000.01
alpha: 39800000.01
alpha: 39900000.01
alpha: 40000000.01
alpha: 40100000.01
alpha: 40200000.01
alpha: 40300000.01
alpha: 40400000.01
alpha: 40500000.01
alpha: 40600000.01
alpha: 40700000.01
alpha: 40800000.01
alpha: 40900000.01
alpha: 41000000.01
alpha: 41100000.01
alpha: 41200000.01
alpha: 41300000.01
alpha: 41400000.01
alpha: 41500000.01
alpha: 41600000.01
alpha: 41700000.01
alpha: 41800000.01
alpha: 41900000.01
alpha: 42000000.01
alpha: 42100000.01
alpha: 42200000.01
alpha: 42300000.01
alpha: 42400000.01
alpha: 42500000.01
alpha: 42600000.01
alpha: 42700000.01
alpha: 42800000.01
alpha: 42900000.01
alpha: 43000

alpha: 81200000.01
alpha: 81300000.01
alpha: 81400000.01
alpha: 81500000.01
alpha: 81600000.01
alpha: 81700000.01
alpha: 81800000.01
alpha: 81900000.01
alpha: 82000000.01
alpha: 82100000.01
alpha: 82200000.01
alpha: 82300000.01
alpha: 82400000.01
alpha: 82500000.01
alpha: 82600000.01
alpha: 82700000.01
alpha: 82800000.01
alpha: 82900000.01
alpha: 83000000.01
alpha: 83100000.01
alpha: 83200000.01
alpha: 83300000.01
alpha: 83400000.01
alpha: 83500000.01
alpha: 83600000.01
alpha: 83700000.01
alpha: 83800000.01
alpha: 83900000.01
alpha: 84000000.01
alpha: 84100000.01
alpha: 84200000.01
alpha: 84300000.01
alpha: 84400000.01
alpha: 84500000.01
alpha: 84600000.01
alpha: 84700000.01
alpha: 84800000.01
alpha: 84900000.01
alpha: 85000000.01
alpha: 85100000.01
alpha: 85200000.01
alpha: 85300000.01
alpha: 85400000.01
alpha: 85500000.01
alpha: 85600000.01
alpha: 85700000.01
alpha: 85800000.01
alpha: 85900000.01
alpha: 86000000.01
alpha: 86100000.01
alpha: 86200000.01
alpha: 86300000.01
alpha: 86400

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 150 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 103 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

In [10]:
para =[0,28500000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  22.73847737393675
The root mean squared error is:  45.95805279347588
The correlation is:  0.7576646272128535
----------------------------------------
alpha: 28500000
Best RMSE is updated! 
The mean absolute error is:  20.619894363623906
The root mean squared error is:  36.72632035743006
The correlation is:  0.8883347201335979
----------------------------------------


## For the last 2 years as the testing set

In [12]:
train_size = 3653
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    if X.sum(axis=1)[i] == 0:
        corrs2[i] = 0
    else:
        corrs2[i] = np.corrcoef(X.iloc[0:train_size,i],y.iloc[0:train_size,0])[0,1]
        
corrs2.shape,X.columns.shape,X.columns[0],X.columns[249]

((250, 1), (250,), ('flu',), ('flu incubation',))

In [13]:
#train2_X0,test2_X0 = corr_filter(train2_X,test2_X,corrs2,0.1)
train2_X1,val2_X1,test2_X1 = corr_filter(train2_X,val2_X,test2_X,corrs2,0.2)
#train2_X2,test2_X2 = corr_filter(train2_X,test2_X,corrs2,0.3)
#train2_X3,test2_X3 = corr_filter(train2_X,test2_X,corrs2,0.4)

In [14]:
#print('For correlation filter r>0.1, we select feature number: ',train2_X0.shape[1])
print('For correlation filter r>0.2, we select feature number: ',train2_X1.shape[1])
#print('For correlation filter r>0.3, we select feature number: ',train2_X2.shape[1])
#print('For correlation filter r>0.4, we select feature number: ',train2_X3.shape[1])

For correlation filter r>0.2, we select feature number:  157


# scaling and modeling
scaler = MinMaxScaler()
train2_X0_scaled = scaler.fit_transform(train2_X0)
test2_X0_scaled = scaler.transform(test2_X0)

train2_X1_scaled = scaler.fit_transform(train2_X1)
test2_X1_scaled = scaler.transform(test2_X1)

train2_X2_scaled = scaler.fit_transform(train2_X2)
test2_X2_scaled = scaler.transform(test2_X2)

train2_X3_scaled = scaler.fit_transform(train2_X3)
test2_X3_scaled = scaler.transform(test2_X3)


para = range(1,100)
para = np.arange(0.01, 10.0, 0.01)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

In [15]:

para =np.arange(0, 10000000, 10000)
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train2_X1,train2_y,val2_X1,val2_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  4.824141181729265
The root mean squared error is:  10.31516320934834
The correlation is:  0.38850181146733165
----------------------------------------
alpha: 10000
Best RMSE is updated! 
The mean absolute error is:  4.805691092591468
The root mean squared error is:  10.249416705583524
The correlation is:  0.39021113735396473
----------------------------------------
alpha: 20000
Best RMSE is updated! 
The mean absolute error is:  4.787762372771287
The root mean squared error is:  10.184086394066915
The correlation is:  0.39197029246842957
----------------------------------------
alpha: 30000
Best RMSE is updated! 
The mean absolute error is:  4.7702625304282105
The root mean squared error is:  10.119651214469608
The correlation is:  0.39375502509385285
----------------------------------------
alpha: 40000
Best RMSE is updated! 
The mean absolute error is:  4.753262350705389
The root mean squared error is:  10.056373408476652
T

Best RMSE is updated! 
The mean absolute error is:  4.30168492186743
The root mean squared error is:  8.254514704931275
The correlation is:  0.4602506871327607
----------------------------------------
alpha: 500000
Best RMSE is updated! 
The mean absolute error is:  4.295606670127988
The root mean squared error is:  8.229177179480727
The correlation is:  0.46136570266405214
----------------------------------------
alpha: 510000
Best RMSE is updated! 
The mean absolute error is:  4.289609570136613
The root mean squared error is:  8.204233394628172
The correlation is:  0.4624703215873956
----------------------------------------
alpha: 520000
Best RMSE is updated! 
The mean absolute error is:  4.283715057597803
The root mean squared error is:  8.179673941312164
The correlation is:  0.4635647237458155
----------------------------------------
alpha: 530000
Best RMSE is updated! 
The mean absolute error is:  4.277906342981061
The root mean squared error is:  8.155489723131236
The correlation

Best RMSE is updated! 
The mean absolute error is:  4.023897207724061
The root mean squared error is:  7.122180405097829
The correlation is:  0.5187329657183491
----------------------------------------
alpha: 1180000
Best RMSE is updated! 
The mean absolute error is:  4.021194203575036
The root mean squared error is:  7.111397542391702
The correlation is:  0.5193992149344767
----------------------------------------
alpha: 1190000
Best RMSE is updated! 
The mean absolute error is:  4.018513358754294
The root mean squared error is:  7.1007262211590945
The correlation is:  0.5200612592153759
----------------------------------------
alpha: 1200000
Best RMSE is updated! 
The mean absolute error is:  4.0158543614185245
The root mean squared error is:  7.090164848667668
The correlation is:  0.5207191398525303
----------------------------------------
alpha: 1210000
Best RMSE is updated! 
The mean absolute error is:  4.013216906043544
The root mean squared error is:  7.079711864260446
The corre

Best RMSE is updated! 
The mean absolute error is:  3.9345270583820575
The root mean squared error is:  6.77941965175113
The correlation is:  0.5413964866607333
----------------------------------------
alpha: 1560000
Best RMSE is updated! 
The mean absolute error is:  3.932490286943623
The root mean squared error is:  6.771975594067722
The correlation is:  0.5419269007081817
----------------------------------------
alpha: 1570000
Best RMSE is updated! 
The mean absolute error is:  3.9304671919667507
The root mean squared error is:  6.764599607371239
The correlation is:  0.5424542827696758
----------------------------------------
alpha: 1580000
Best RMSE is updated! 
The mean absolute error is:  3.928457619388849
The root mean squared error is:  6.7572908974524575
The correlation is:  0.5429786553572972
----------------------------------------
alpha: 1590000
Best RMSE is updated! 
The mean absolute error is:  3.926461417707689
The root mean squared error is:  6.750048683038685
The corre

Best RMSE is updated! 
The mean absolute error is:  3.840693235873784
The root mean squared error is:  6.449254623658899
The correlation is:  0.5669579311365761
----------------------------------------
alpha: 2120000
Best RMSE is updated! 
The mean absolute error is:  3.8393276568209247
The root mean squared error is:  6.444678433167453
The correlation is:  0.5673462835201494
----------------------------------------
alpha: 2130000
Best RMSE is updated! 
The mean absolute error is:  3.837969472727764
The root mean squared error is:  6.440140104949817
The correlation is:  0.5677325086429532
----------------------------------------
alpha: 2140000
Best RMSE is updated! 
The mean absolute error is:  3.836618620073237
The root mean squared error is:  6.435639284758182
The correlation is:  0.5681166179827064
----------------------------------------
alpha: 2150000
Best RMSE is updated! 
The mean absolute error is:  3.835275036166204
The root mean squared error is:  6.43117562279724
The correla

Best RMSE is updated! 
The mean absolute error is:  3.7770041419975366
The root mean squared error is:  6.245795008303055
The correlation is:  0.5854263723767039
----------------------------------------
alpha: 2670000
Best RMSE is updated! 
The mean absolute error is:  3.776097075283831
The root mean squared error is:  6.242871180290046
The correlation is:  0.5857123260123566
----------------------------------------
alpha: 2680000
Best RMSE is updated! 
The mean absolute error is:  3.775194265280731
The root mean squared error is:  6.239970818123291
The correlation is:  0.5859966541854502
----------------------------------------
alpha: 2690000
Best RMSE is updated! 
The mean absolute error is:  3.774295682810155
The root mean squared error is:  6.237093732803445
The correlation is:  0.586279364170049
----------------------------------------
alpha: 2700000
Best RMSE is updated! 
The mean absolute error is:  3.773401299003114
The root mean squared error is:  6.2342397372697285
The correl

Best RMSE is updated! 
The mean absolute error is:  3.7297650255518864
The root mean squared error is:  6.1062552618943675
The correlation is:  0.5999344950341085
----------------------------------------
alpha: 3270000
Best RMSE is updated! 
The mean absolute error is:  3.7290918521335072
The root mean squared error is:  6.104460571948228
The correlation is:  0.6001343763898823
----------------------------------------
alpha: 3280000
Best RMSE is updated! 
The mean absolute error is:  3.7284215614230978
The root mean squared error is:  6.1026807248510675
The correlation is:  0.6003329993490684
----------------------------------------
alpha: 3290000
Best RMSE is updated! 
The mean absolute error is:  3.7277541372981844
The root mean squared error is:  6.100915613507546
The correlation is:  0.6005303690821896
----------------------------------------
alpha: 3300000
Best RMSE is updated! 
The mean absolute error is:  3.727089563776012
The root mean squared error is:  6.099165131749858
The c

Best RMSE is updated! 
The mean absolute error is:  3.7079828156727395
The root mean squared error is:  6.051617155290845
The correlation is:  0.6062142203934505
----------------------------------------
alpha: 3620000
Best RMSE is updated! 
The mean absolute error is:  3.707417284611414
The root mean squared error is:  6.050283823787075
The correlation is:  0.6063729845153814
----------------------------------------
alpha: 3630000
Best RMSE is updated! 
The mean absolute error is:  3.706854120002499
The root mean squared error is:  6.0489620786700735
The correlation is:  0.6065306579310015
----------------------------------------
alpha: 3640000
Best RMSE is updated! 
The mean absolute error is:  3.7062933098595763
The root mean squared error is:  6.0476518400468295
The correlation is:  0.6066872450678726
----------------------------------------
alpha: 3650000
Best RMSE is updated! 
The mean absolute error is:  3.7057348422906533
The root mean squared error is:  6.046353028665678
The co

Best RMSE is updated! 
The mean absolute error is:  3.6886544920629842
The root mean squared error is:  6.009419328528459
The correlation is:  0.6113953323588218
----------------------------------------
alpha: 3990000
Best RMSE is updated! 
The mean absolute error is:  3.6881742559850927
The root mean squared error is:  6.0084660690206455
The correlation is:  0.6115165552548382
----------------------------------------
alpha: 4000000
Best RMSE is updated! 
The mean absolute error is:  3.6876959903421658
The root mean squared error is:  6.007521821306733
The correlation is:  0.6116368400270662
----------------------------------------
alpha: 4010000
Best RMSE is updated! 
The mean absolute error is:  3.6872196860572357
The root mean squared error is:  6.0065865254654955
The correlation is:  0.6117561905034855
----------------------------------------
alpha: 4020000
Best RMSE is updated! 
The mean absolute error is:  3.6867453341173815
The root mean squared error is:  6.005660122023967
The 

Best RMSE is updated! 
The mean absolute error is:  3.672149101673876
The root mean squared error is:  5.9797129128318325
The correlation is:  0.6152846151829571
----------------------------------------
alpha: 4360000
Best RMSE is updated! 
The mean absolute error is:  3.6717432467037474
The root mean squared error is:  5.979056413112662
The correlation is:  0.6153735700416951
----------------------------------------
alpha: 4370000
Best RMSE is updated! 
The mean absolute error is:  3.671339053294681
The root mean squared error is:  5.978406977586529
The correlation is:  0.6154617193916136
----------------------------------------
alpha: 4380000
Best RMSE is updated! 
The mean absolute error is:  3.6709365143998305
The root mean squared error is:  5.977764560509665
The correlation is:  0.6155490665745874
----------------------------------------
alpha: 4390000
Best RMSE is updated! 
The mean absolute error is:  3.6705356230173076
The root mean squared error is:  5.977129116461137
The cor

Best RMSE is updated! 
The mean absolute error is:  3.6581871301849485
The root mean squared error is:  5.959791960295942
The correlation is:  0.6180646706837485
----------------------------------------
alpha: 4730000
Best RMSE is updated! 
The mean absolute error is:  3.657838385885351
The root mean squared error is:  5.959368712375629
The correlation is:  0.6181259572798401
----------------------------------------
alpha: 4740000
Best RMSE is updated! 
The mean absolute error is:  3.6574910696327008
The root mean squared error is:  5.9589510318498276
The correlation is:  0.6181865543853364
----------------------------------------
alpha: 4750000
Best RMSE is updated! 
The mean absolute error is:  3.6571451758142297
The root mean squared error is:  5.958538883311238
The correlation is:  0.6182464649282234
----------------------------------------
alpha: 4760000
Best RMSE is updated! 
The mean absolute error is:  3.6568006988499024
The root mean squared error is:  5.958132231590913
The co

Best RMSE is updated! 
The mean absolute error is:  3.647283288886468
The root mean squared error is:  5.948071047076312
The correlation is:  0.6198180846500552
----------------------------------------
alpha: 5080000
Best RMSE is updated! 
The mean absolute error is:  3.6470363911512766
The root mean squared error is:  5.947823058642035
The correlation is:  0.6198569149134224
----------------------------------------
alpha: 5090000
Best RMSE is updated! 
The mean absolute error is:  3.646790650995456
The root mean squared error is:  5.94757952882297
The correlation is:  0.6198951521962637
----------------------------------------
alpha: 5100000
Best RMSE is updated! 
The mean absolute error is:  3.6465460640969227
The root mean squared error is:  5.9473404295482
The correlation is:  0.6199327990821702
----------------------------------------
alpha: 5110000
Best RMSE is updated! 
The mean absolute error is:  3.646302626156508
The root mean squared error is:  5.947105732928036
The correlat

Best RMSE is updated! 
The mean absolute error is:  3.6350284464066003
The root mean squared error is:  5.940112572676759
The correlation is:  0.6212154564007143
----------------------------------------
alpha: 5730000
Best RMSE is updated! 
The mean absolute error is:  3.6348769918755415
The root mean squared error is:  5.9401031321215045
The correlation is:  0.6212207304625794
----------------------------------------
alpha: 5740000
Best RMSE is updated! 
The mean absolute error is:  3.6347263656352817
The root mean squared error is:  5.940096652295326
The correlation is:  0.6212255613782539
----------------------------------------
alpha: 5750000
Best RMSE is updated! 
The mean absolute error is:  3.634576564871731
The root mean squared error is:  5.940093114592926
The correlation is:  0.6212299511839968
----------------------------------------
alpha: 5760000
Best RMSE is updated! 
The mean absolute error is:  3.6344275867846867
The root mean squared error is:  5.9400925005237015
The c

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.3, 154 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

para = [10]
para_l=[0.3]
print('For correlation filter r>=0.4, 105 features are selected')
for a in para:
    for l in para_l:
        print('alpha:',a,' L1-ratio:',l)
        print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

In [16]:
## tuned alpha based on validation set, then measure the performance for test set.
## Fixing l1-ratio = 0.3
para = [0,5760000]
rmse=100000
mae=1000000
corr=0
for a in para:
    print('alpha:',a)
    rmse0,mae0,corr0 = ridge(a,train1_X1,train1_y,val1_X1,val1_y)
    if rmse0< rmse:
        rmse=rmse0
        mae=mae0
        corr = corr0
        print('========================================')
        print('Best RMSE is updated! ' )
        print('The mean absolute error is: ',mae)
        print('The root mean squared error is: ',rmse)
        print('The correlation is: ',corr)
        print('----------------------------------------')

alpha: 0
Best RMSE is updated! 
The mean absolute error is:  22.73847737393675
The root mean squared error is:  45.95805279347588
The correlation is:  0.7576646272128535
----------------------------------------
alpha: 5760000
Best RMSE is updated! 
The mean absolute error is:  21.35787124172689
The root mean squared error is:  39.43994758867146
The correlation is:  0.8886819918013812
----------------------------------------
