## tasks
### 1: Form training and testing periods as discussed (2 test periods, last 2*365 days). Be careful to scale data based only on data from the training set. You may use (from Python) MinMaxScaler, standardisation (StandardScaler) or the default scaler in Python's elastic net.
### 2: Estimate the current flu rates by training an elastic net model. Use a Pearson correlation filter (r > 0.3) on the training data to reduce the number of queries prior to training an elastic net (reminder: not all 1000 queries I provided are related to flu!). Report performance on the two test sets using three metrics: mean absolute error, root mean squared error and Pearson's correlation.
### 3: If there is time, begin work on traditional forecasting models (you've identified seasonal ARIMA and Holt-Winters).

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import csv
import scipy.stats as stats
import seaborn as sns
from collections import Counter
from collections import defaultdict
import math
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from random import randint

import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

# Plot defaults used by the rest of the notebook.
bin_size = 10                      # histogram size
fizsize_with_subplots = (10, 10)   # figure size for subplot grids
plt.rc('figure', figsize=(10, 5))  # default size of a single figure

In [2]:
# Loading data: all four CSV files are read with header=None, so no row is
# consumed as a header and columns get integer labels.
dates, queries, X, y = [
    pd.read_csv(path, header=None)
    for path in ('data/dates.csv', 'data/queries.csv', 'data/X.csv', 'data/y.csv')
]

# Label the feature columns with the query terms.
# NOTE(review): `queries` is a DataFrame, and the recorded output below shows
# the resulting labels as 1-tuples such as ('flu',) -- confirm this is intended.
X.columns = queries

In [3]:
# data preprocessing and train-test split
# The author notes the data is well formed (no missing values, no
# non-numerical symbols or labels), so the only work here is the split.

# Keep only the first 500 queries.
X = X.iloc[:, 0: 500]

length = X.shape[0]

# Split 1: the last year (365 rows) is the test period.
test_size = 365
split1 = length - test_size
train1_X, test1_X = X[:split1], X[split1:]
train1_y, test1_y = y[:split1], y[split1:]

# Split 2: the last two years (2 * 365 rows) are the test period.
test_size2 = 365*2
split2 = length - test_size2
train2_X, test2_X = X[:split2], X[split2:]
train2_y, test2_y = y[:split2], y[split2:]

print('X shape: ',X.shape,'  y shape: ',y.shape)
print('train1 X:',train1_X.shape,' train1 y:',train1_y.shape,' Test1 X shape:',test1_X.shape, ' Test1 y shape:',test1_y.shape)
print('train2 X:',train2_X.shape,' train2 y:',train2_y.shape,' Test2 X:',test2_X.shape,' Test2 y:',test2_y.shape)

X shape:  (4383, 500)   y shape:  (4383, 1)
train1 X: (4018, 500)  train1 y: (4018, 1)  Test1 X shape: (365, 500)  Test1 y shape: (365, 1)
train2 X: (3653, 500)  train2 y: (3653, 1)  Test2 X: (730, 500)  Test2 y: (730, 1)


In [4]:
# Pearson correlation between every query and the flu rate, measured on the
# training rows of the first split only, so the held-out year never influences
# feature selection.
train_size = train1_X.shape[0]
train_target = y.iloc[0:train_size, 0]
corrs = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    train_column = X.iloc[0:train_size,i]
    # A query that never varies in the training rows has no defined
    # correlation (np.corrcoef would return NaN, and NaN survives a
    # `< threshold` filter). Leave its score at 0 so it is filtered out.
    # The previous guard, X.sum(axis=1)[i], inspected the sum of *row* i
    # rather than column i.
    if train_column.min() == train_column.max():
        continue
    corrs[i] = np.corrcoef(train_column,train_target)[0,1]

corrs.shape,X.columns.shape,X.columns[0],X.columns[499]

((500, 1), (500,), ('flu',), ('symptoms of kidney infection',))

In [5]:
def corr_filter(df_X,df_Xtest,corrs,threshold):
    """Drop weakly correlated feature columns from a train/test pair.

    Column ``i`` of ``df_X`` is removed from copies of both frames when
    ``corrs[i, 0] < threshold``. The inputs are never modified.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Training features; its column order defines which score belongs to
        which column.
    df_Xtest : pandas.DataFrame
        Test features; must contain every column label that gets dropped
        from ``df_X``.
    corrs : array-like, shape (n_features, 1)
        One correlation score per column of ``df_X``.
    threshold : float
        Columns scoring strictly below this value are removed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Filtered copies of ``df_X`` and ``df_Xtest``.

    Raises
    ------
    ValueError
        If ``corrs`` has fewer rows than ``df_X`` has columns.
    """
    n_features = df_X.shape[1]
    scores = np.asarray(corrs)
    if scores.shape[0] < n_features:
        raise ValueError(
            'corrs has %d rows but df_X has %d columns'
            % (scores.shape[0], n_features)
        )

    X = df_X.copy()
    X_test = df_Xtest.copy()
    # The number of columns comes from the frame itself (it used to be a
    # hard-coded 500). NaN scores compare False here, so such columns are kept.
    for i in np.flatnonzero(scores[:n_features, 0] < threshold):
        colname = df_X.columns[i]
        del X[colname] # deleting the column from the dataset
        del X_test[colname]

    return X,X_test

In [6]:
# Build the filtered train/test pairs of the first split for four candidate
# correlation thresholds (0.1, 0.2, 0.3, 0.4), in that order.
(
    (train1_X0, test1_X0),
    (train1_X1, test1_X1),
    (train1_X2, test1_X2),
    (train1_X3, test1_X3),
) = [
    corr_filter(train1_X, test1_X, corrs, cutoff)
    for cutoff in (0.1, 0.2, 0.3, 0.4)
]

In [7]:
# Report how many query columns survive each threshold on the first split.
split1_reports = (
    ('For correlation filter r>=0.1, we select feature number: ', train1_X0),
    ('For correlation filter r>=0.2, we select feature number: ', train1_X1),
    ('For correlation filter r>=0.3, we select feature number: ', train1_X2),
    ('For correlation filter r>=0.4, we select feature number: ', train1_X3),
)
for message, kept in split1_reports:
    print(message, kept.shape[1])

For correlation filter r>=0.1, we select feature number:  310
For correlation filter r>=0.2, we select feature number:  212
For correlation filter r>=0.3, we select feature number:  150
For correlation filter r>=0.4, we select feature number:  103


In [8]:
# Define the evaluation metrics: MAE, RMSE and Pearson correlation
# (mean absolute error, root mean squared error and Pearson's correlation).
from sklearn.metrics import mean_absolute_error
# mae = mean_absolute_error(y_actual, y_pred)

from sklearn.metrics import mean_squared_error
from math import sqrt
# rmse = sqrt(mean_squared_error(y_actual, y_pred))

# np.corrcoef returns the matrix of Pearson product-moment correlation coefficients
def pearson_r(x,y):
    """Return Pearson's r between ``x`` and ``y``.

    This is the off-diagonal entry ``[0, 1]`` of ``np.corrcoef(x, y)``.
    """
    return np.corrcoef(x, y)[0, 1]
# r = pearson_r(y_actual,y_pred)


# Generalise the function for convenient tuning
def eNet(a,l,train_X,train_y,test_X,test_y):
    """Fit an elastic net on one split and print its test-set metrics.

    Features and target are standardised with ``StandardScaler`` objects
    fitted on the training data only. Predictions are mapped back to the
    original target scale before being scored against ``test_y``.

    Parameters
    ----------
    a : float
        ``alpha`` passed to ``ElasticNet``.
    l : float
        ``l1_ratio`` passed to ``ElasticNet``.
    train_X, test_X : array-like
        Feature matrices -- presumably DataFrames with identical columns;
        confirm against the caller.
    train_y, test_y : array-like
        Single-column targets (the scaler is fitted on ``train_y`` directly,
        so it must be 2-D).

    Returns
    -------
    None
        The function only prints: number of non-zero weights, mean absolute
        error, root mean squared error, Pearson's r and the combined score
        ``MAE + RMSE - r``.
    """
    # Scale the features with training-set statistics only.
    scalerX = StandardScaler().fit(train_X)
    train_X_scaled = scalerX.transform(train_X)
    test_X_scaled = scalerX.transform(test_X)

    # Scale the training target; test_y stays on its original scale because
    # predictions are inverse-transformed before scoring.
    scalery = StandardScaler().fit(train_y)
    train_y_scaled = scalery.transform(train_y)

    # `normalize` is no longer passed: it was removed from ElasticNet in
    # scikit-learn 1.2 and its old default was False, so the model is the same.
    enet = ElasticNet(alpha=a, copy_X=True, fit_intercept=True, l1_ratio=l,
      max_iter=10000, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
    enet.fit(train_X_scaled,train_y_scaled)

    print('Nonzero weights: %d from %d' % (len(np.nonzero(enet.coef_)[0]), len(enet.coef_)))

    # StandardScaler.inverse_transform expects a 2-D array, so hand it the
    # predictions as a single column and flatten the result afterwards.
    y_pred_scaled = np.asarray(enet.predict(test_X_scaled)).reshape(-1, 1)
    y_pred1 = scalery.inverse_transform(y_pred_scaled).ravel()
    y_true = np.asarray(test_y).ravel()

    mae1 = mean_absolute_error(y_true, y_pred1)
    print('The mean absolute error is: ',mae1)

    rmse1 = sqrt(mean_squared_error(y_true, y_pred1))
    print('The root mean squared error is: ',rmse1)

    # Compute the correlation once and reuse it for the combined score.
    corr = pearson_r(y_true, y_pred1)
    print('The correlation is: ',corr)

    # Single number for comparing settings: lower errors and a higher
    # correlation both reduce it.
    total = mae1+rmse1- corr
    print(total)
    
    


In [9]:
# Candidate alpha / L1-ratio values for the r>=0.1 feature set (first split).
para = [0.01]
para_l = [0.4]
print('For correlation filter r>=0.1, 310 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train1_X0,train1_y,test1_X0,test1_y))

For correlation filter r>=0.1, 310 features are selected
alpha: 0.01  L1-ratio: 0.4
Nonzero weights: 182 from 310
The mean absolute error is:  2.6303552248996027
The root mean squared error is:  3.247224679968285
The correlation is:  0.917503448908422
4.960076455959467
None


In [11]:
# Candidate alpha / L1-ratio values for the r>=0.2 feature set (first split).
para = [1.09]
para_l = [0.3]
print('For correlation filter r>=0.2, 212 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train1_X1,train1_y,test1_X1,test1_y))

For correlation filter r>=0.2, 212 features are selected
alpha: 1.09  L1-ratio: 0.3
Nonzero weights: 19 from 212
The mean absolute error is:  4.534601429940703
The root mean squared error is:  5.02256044172397
The correlation is:  0.8326618376341202
8.724500034030552
None


In [28]:
# Candidate alpha / L1-ratio values for the r>=0.3 feature set (first split).
para = [0.01]
para_l = [0.01]
print('For correlation filter r>=0.3, 150 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train1_X2,train1_y,test1_X2,test1_y))

For correlation filter r>=0.3, 150 features are selected
alpha: 0.01  L1-ratio: 0.01
Nonzero weights: 149 from 150
The mean absolute error is:  2.6891096524313056
The root mean squared error is:  3.2349801041664223
The correlation is:  0.9239579225739746
5.000131834023754
None


In [31]:
# Candidate alpha / L1-ratio values for the r>=0.4 feature set (first split).
para = [0.01]
para_l = [0.01]
print('For correlation filter r>=0.4, 103 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train1_X3,train1_y,test1_X3,test1_y))

For correlation filter r>=0.4, 103 features are selected
alpha: 0.01  L1-ratio: 0.01
Nonzero weights: 102 from 103
The mean absolute error is:  2.8430629849367426
The root mean squared error is:  3.289934057935316
The correlation is:  0.9152612670397793
5.2177357758322795
None


## For the last 2 years as the testing set

In [32]:
# Pearson correlation between every query and the flu rate, measured on the
# training rows of the second split only (the last two years are held out),
# so the test period never influences feature selection.
train_size = train2_X.shape[0]
train_target2 = y.iloc[0:train_size, 0]
corrs2 = np.zeros((X.shape[1],1))
for i in range(0,X.shape[1]):
    train_column2 = X.iloc[0:train_size,i]
    # A query that never varies in the training rows has no defined
    # correlation (np.corrcoef would return NaN, and NaN survives a
    # `< threshold` filter). Leave its score at 0 so it is filtered out.
    # The previous guard, X.sum(axis=1)[i], inspected the sum of *row* i
    # rather than column i.
    if train_column2.min() == train_column2.max():
        continue
    corrs2[i] = np.corrcoef(train_column2,train_target2)[0,1]

corrs2.shape,X.columns.shape,X.columns[0],X.columns[499]

((500, 1), (500,), ('flu',), ('symptoms of kidney infection',))

In [33]:
# Build the filtered train/test pairs of the second split for four candidate
# correlation thresholds (0.1, 0.2, 0.3, 0.4), in that order.
(
    (train2_X0, test2_X0),
    (train2_X1, test2_X1),
    (train2_X2, test2_X2),
    (train2_X3, test2_X3),
) = [
    corr_filter(train2_X, test2_X, corrs2, cutoff)
    for cutoff in (0.1, 0.2, 0.3, 0.4)
]

In [34]:
# Report how many query columns survive each threshold on the second split.
split2_reports = (
    ('For correlation filter r>0.1, we select feature number: ', train2_X0),
    ('For correlation filter r>0.2, we select feature number: ', train2_X1),
    ('For correlation filter r>0.3, we select feature number: ', train2_X2),
    ('For correlation filter r>0.4, we select feature number: ', train2_X3),
)
for message, kept in split2_reports:
    print(message, kept.shape[1])

For correlation filter r>0.1, we select feature number:  303
For correlation filter r>0.2, we select feature number:  215
For correlation filter r>0.3, we select feature number:  154
For correlation filter r>0.4, we select feature number:  105


In [43]:
# Candidate alpha / L1-ratio values for the r>=0.1 feature set (second split).
para = [0.01]
para_l = [0.26]
print('For correlation filter r>=0.1, 303 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train2_X0,train2_y,test2_X0,test2_y))

For correlation filter r>=0.1, 303 features are selected
alpha: 0.01  L1-ratio: 0.26
Nonzero weights: 212 from 303
The mean absolute error is:  2.523319760568521
The root mean squared error is:  3.266459986228576
The correlation is:  0.875528619721812
4.914251127075285
None


In [47]:
# Candidate alpha / L1-ratio values for the r>=0.2 feature set (second split).
para = [0.01]
para_l = [0.12]
print('For correlation filter r>=0.2, 215 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train2_X1,train2_y,test2_X1,test2_y))

For correlation filter r>=0.2, 215 features are selected
alpha: 0.01  L1-ratio: 0.12
Nonzero weights: 183 from 215
The mean absolute error is:  2.627647539596773
The root mean squared error is:  3.349693597355846
The correlation is:  0.8704305105179881
5.106910626434631
None


In [50]:
# Candidate alpha / L1-ratio values for the r>=0.3 feature set (second split).
para = [0.01]
para_l = [0.01]
print('For correlation filter r>=0.3, 154 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train2_X2,train2_y,test2_X2,test2_y))

For correlation filter r>=0.3, 154 features are selected
alpha: 0.01  L1-ratio: 0.01
Nonzero weights: 151 from 154
The mean absolute error is:  2.6122891594499604
The root mean squared error is:  3.3028834261242697
The correlation is:  0.8827328622776955
5.032439723296535
None


In [53]:
# Candidate alpha / L1-ratio values for the r>=0.4 feature set (second split).
para = [0.01]
para_l = [0.01]
print('For correlation filter r>=0.4, 105 features are selected')
for a, l in [(alpha, ratio) for alpha in para for ratio in para_l]:
    print('alpha:',a,' L1-ratio:',l)
    # eNet prints its own metrics and returns None, which is printed as well.
    print(eNet(a,l,train2_X3,train2_y,test2_X3,test2_y))

For correlation filter r>=0.4, 105 features are selected
alpha: 0.01  L1-ratio: 0.01
Nonzero weights: 105 from 105
The mean absolute error is:  2.8293170011843682
The root mean squared error is:  3.365114670311403
The correlation is:  0.8937143146905017
5.3007173568052695
None
