In [30]:
import numpy as np
from numpy import load
import random
from sklearn.linear_model import Ridge
from fancyimpute import SoftImpute, BiScaler
from sklearn.linear_model import LinearRegression as reg
from sklearn.linear_model import ElasticNetCV
import itertools
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
import warnings
import os, sys
warnings.filterwarnings('ignore')

data=np.load('august2020-exercise1.npz')

#print(data.files) # view the files in the npz

#function to normalize the data
def normalize(matrix):
    normed=(matrix-matrix.mean(axis=0))/matrix.std(axis=0)
    return normed

######## train data ########
X_lr_small_train=normalize(data['X_lr_small_train'])
X_lr_big_train=normalize(data['X_lr_big_train'])
X_hr_small_train=normalize(data['X_hr_small_train'])
X_hr_big_train=normalize(data['X_hr_big_train'])

y_lr_small_train=normalize(data['y_lr_small_train'])
y_lr_big_train=normalize(data['y_lr_big_train'])
y_hr_small_train=normalize(data['y_hr_small_train'])
y_hr_big_train=normalize(data['y_hr_big_train'])
######## test data#########
X_lr_test=normalize(data['X_lr_test'])
X_hr_test=normalize(data['X_hr_test'])

y_lr_test=normalize(data['y_lr_test'])
y_hr_test=normalize(data['y_hr_test'])

def compRandomFunc(array,probability): # probability=0.7
    n=array.shape[0]
    p=array.shape[1]
    array=array.astype("float")
    for i in range(n):    
        for j in range(p):
            r=random.uniform(0, 1)
            if r < probability:
                array[i,j]=np.NaN
    return array

def missingByFeature(array,p1,p2): #p1=0.875, p2=0.8
    n=array.shape[0]
    p=array.shape[1]
    array=array.astype("float")
    featureList=np.array([])
    for i in range(p):
        r = random.uniform(0, 1)
        if r<p1:
            featureList=np.append(featureList,i)
    for i in featureList:  # i is the counter for feature not sample size
        i=int(i)
        for j in range(n):
            r2=random.uniform(0,1)
            if r2<p2:
                array[j,i]=np.NaN
    return array

def meanImputation(feature,data):
    imputed=np.nanmean(data[:,feature])
    return imputed

# creating data with missing values
import math
probability=0.7
p1=0.875
p2=0.8
# return the imputed matrix with mean imputation
def imputetMissing(matrix):
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if math.isnan(matrix[i,j]) is True:
                matrix[i,j]=meanImputation(j,matrix)
    return matrix

def test(models, X_train,y_train,X_test,y_test, iterations = 100):
    results = {}
    for i in models:
        r2_train = []
        r2_test = []
        for j in range(iterations):
            r2_test.append(metrics.r2_score(y_test,
                                            models[i].fit(X_train, 
                                                         y_train).predict(X_test)))

            r2_train.append(metrics.r2_score(y_train, 
                                             models[i].fit(X_train, 
                                                          y_train).predict(X_train)))
        results[i] = [np.mean(r2_train),np.mean(r2_test)]
    return pd.DataFrame(results)

def importantFeatures(best_model,X_train,y_train,BS_size):
    importantFeatures=np.array([])
    
    for i in range(100):
        #creatig bootstrap boots
        x_bag=np.empty((0,X_train.shape[1]))
        y_bag=np.array([])
        for j in range(BS_size): 
            randIndex=np.random.randint(len(X_train), size=1)
            x_sample=X_train[randIndex,:]
            y_sample=y_train[randIndex]
                
            x_bag=np.concatenate((x_bag,x_sample))
            y_bag=np.append(y_bag,y_sample)             
        # test the estimator on bootstrap sample 100 times
        #skippa grid search och kör dirr på bästa hyperparametrarna för att spara tid
        bestEstimator=best_model.fit(x_bag,y_bag)
        
        rfeBest=RFE(bestEstimator)
        X_rfe=rfeBest.fit_transform(x_bag,y_bag)
        bestEstimator.fit(X_rfe,y_bag)
        
        importantFeatures=np.append(importantFeatures,np.where(rfeBest.ranking_==1))
        
    return importantFeatures

def featureImportance(featureList):
    unique_elements, counts_elements = np.unique(featureList, return_counts=True)
    return unique_elements, counts_elements

def mostCommonFeatures(occurrence):
    mostCommonIndex=np.array([])
    for i in range(len(occurrence)):
        if occurrence[i]>=80:
            mostCommonIndex=np.append(mostCommonIndex,[i])
    mostCommonIndex=np.array(mostCommonIndex,dtype=np.int8)
    mostCommonFeatures=featureIndex[mostCommonIndex]
    return mostCommonFeatures

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout



In [20]:
models = {'OLS': linear_model.LinearRegression(),
         'Lasso': linear_model.Lasso(),
         'Ridge': linear_model.Ridge(),}

lasso_params = {'alpha':[0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha':[200, 250, 300, 400, 500]}
eNet_params={'alpha': [0, 0.5, 0.1, 0.01, 0.001],
             'l1_ratio': [0, 0.25, 0.5, 0.75, 1]}

#grid search for high rate big data
models4 = {'OLS': linear_model.LinearRegression(),
           'Lasso': GridSearchCV(linear_model.Lasso(), 
                               param_grid=lasso_params).fit(X_hr_big_train,y_hr_big_train).best_estimator_,
           'Ridge': GridSearchCV(linear_model.Ridge(), 
                               param_grid=ridge_params).fit(X_hr_big_train,y_hr_big_train).best_estimator_,
          'Elastic net':GridSearchCV(linear_model.ElasticNet(), 
                               param_grid=eNet_params).fit(X_hr_big_train,y_hr_big_train).best_estimator_}

print('high rate big data')
test(models4, X_hr_big_train,y_hr_big_train,X_hr_test,y_hr_test)
print(GridSearchCV(linear_model.ElasticNet(),param_grid=eNet_params).fit(X_hr_big_train,y_hr_big_train).best_estimator_)

high rate big data
ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=1,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


In [37]:
bestModel=ElasticNet(l1_ratio=1,alpha=0.01)
bestModel.fit(X_hr_big_train,y_hr_big_train)
print('Best model score:',bestModel.score(X_hr_test,y_hr_test))
#print('Best model coefficients:',bestModel.coef_)

Best model score: 0.3277928784922103


In [38]:
bestModel.coef_

array([-3.98059472e-03,  2.97842846e-03,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  1.11043853e-03,  0.00000000e+00,
        0.00000000e+00,  7.15773296e-03,  9.28168764e-03, -0.00000000e+00,
        0.00000000e+00, -6.22823546e-03, -8.60493095e-03, -0.00000000e+00,
        0.00000000e+00,  2.76826628e-02, -0.00000000e+00,  2.73932387e-02,
       -7.18300214e-03, -4.54331138e-03,  6.71100963e-02,  0.00000000e+00,
       -2.22491822e-03, -8.01775808e-03,  0.00000000e+00,  1.05994721e-02,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        6.05679793e-02,  0.00000000e+00, -2.13372586e-02, -2.66520158e-04,
       -8.92456517e-03,  2.47391817e-04,  0.00000000e+00,  1.11555115e-01,
        5.52255814e-03, -0.00000000e+00,  0.00000000e+00,  7.25237241e-02,
        1.00403524e-02, -2.24215480e-03,  0.00000000e+00, -2.96248255e-03,
       -3.79479840e-03,  7.99407915e-02,  0.00000000e+00, -5.09904945e-03,
        9.75704921e-04, -

In [35]:
BS_size_big=1000
feature_hr_big=importantFeatures(bestModel,X_hr_big_train,y_hr_big_train,BS_size_big)
featureIndex,occurrence=featureImportance(feature_hr_big)
print(mostCommonFeatures(occurrence))

[ 22.  32.  39.  43.  49.  97. 110.  81.  83.  92. 116. 118. 120.]


In [36]:
len(mostCommonFeatures(occurrence))

13

In [31]:
#random missing and softImpute
softImpute = SoftImpute()
biscaler = BiScaler()
softImputeScore=np.array([])
scale=np.array([0.1,0.25,0.5,0.75])

for i in range(10):
    with HiddenPrints():
        X_randomMissingSoft=compRandomFunc(X_hr_big_train,probability) #here is the line where you enable missing data
        X_randomMissingSoft_normalized = biscaler.fit_transform(X_randomMissingSoft)
        
        lambda_0=np.nanmax(X_randomMissingSoft_normalized)
        bestOfLambdaScore=np.array([])
        for j in range(len(scale)):
            softImpute=SoftImpute(shrinkage_value=lambda_0*scale[j])
            
            X_imputed_soft_normalized = softImpute.fit_transform(X_randomMissingSoft_normalized)
            X_imputed_soft = biscaler.inverse_transform(X_imputed_soft_normalized)
            
            bestModel.fit(X_imputed_soft,y_hr_big_train)
            bestOfLambdaScore=np.append(bestOfLambdaScore,bestModel.score(X_hr_test,y_hr_test))
            
        bestLambdaIndex=np.where(bestOfLambdaScore==np.max(bestOfLambdaScore))
        softImpute=SoftImpute(shrinkage_value=lambda_0*scale[bestLambdaIndex])
        X_imputed_soft_normalized = softImpute.fit_transform(X_randomMissingSoft_normalized)
        X_imputed_soft = biscaler.inverse_transform(X_imputed_soft_normalized)
        
        softImputeScore=np.append(softImputeScore,np.max(bestOfLambdaScore))
        
    #do the feature selection thing
    softImpute_feature=importantFeatures(bestModel,X_imputed_soft,y_hr_big_train,BS_size_big)
    softImputed_featureIndex,softImputed_occurrence=featureImportance(softImpute_feature)
    print('There are',len(mostCommonFeatures(softImputed_occurrence)),'important features',flush=True)
    print(mostCommonFeatures(softImputed_occurrence),flush=True)
print(softImputeScore,flush=True)

There are 9 important features
[ 22.  39.  43.  97. 110.  81.  92. 116. 120.]
There are 8 important features
[ 39.  97. 110.  81.  92. 112. 116. 120.]
There are 9 important features
[ 39.  49.  97. 110.  81.  83.  92. 116. 120.]
There are 9 important features
[ 19.  39.  97. 110.  81.  92.  93. 116. 120.]
There are 8 important features
[ 39.  49.  97. 110.  81.  92. 116. 120.]
There are 9 important features
[ 22.  39.  49.  97. 110.  81.  92. 116. 120.]
There are 7 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 7 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 8 important features
[  0.  39.  97. 110.  81.  92. 116. 120.]
There are 8 important features
[ 39.  43.  97. 110.  81.  92. 116. 120.]
[0.28863611 0.27516068 0.26625792 0.28166749 0.28822205 0.29232699
 0.28869516 0.28115911 0.28970304 0.2863328 ]


In [32]:
#softImpute with missing by feature
softImputeMBFScore=np.array([])
for i in range(10):
    with HiddenPrints():
        X_MBF_soft=missingByFeature(X_hr_big_train,p1,p2)
        X_MBF_soft_normalized=biscaler.fit_transform(X_MBF_soft)
        
        lambda_0=np.nanmax(X_randomMissingSoft_normalized)
        bestOfLambdaScore=np.array([])
        for j in range(len(scale)):
            softImpute=SoftImpute(shrinkage_value=lambda_0*scale[j])
            
            X_MBF_imputed_soft_normalized=softImpute.fit_transform(X_MBF_soft_normalized)
            X_MBF_imputed_soft=biscaler.inverse_transform(X_MBF_imputed_soft_normalized)
            
            bestModel.fit(X_MBF_imputed_soft,y_hr_big_train)
            bestOfLambdaScore=np.append(bestOfLambdaScore,bestModel.score(X_hr_test,y_hr_test))
        
        bestLambdaIndex=np.where(bestOfLambdaScore==np.max(bestOfLambdaScore))
        softImpute=SoftImpute(shrinkage_value=lambda_0*scale[bestLambdaIndex])
        
        X_MBF_imputed_soft_normalized=softImpute.fit_transform(X_MBF_soft_normalized)
        X_MBF_imputed_soft=biscaler.inverse_transform(X_MBF_imputed_soft_normalized)
        
        softImputeMBFScore=np.append(softImputeMBFScore,np.max(bestOfLambdaScore))
    
    #do the feature selection thing
    softImpute_MBF_feature=importantFeatures(bestModel,X_MBF_imputed_soft,y_hr_big_train,BS_size_big)
    softImputed_MBF_featureIndex,softImputed_MBF_occurrence=featureImportance(softImpute_MBF_feature)
    
    print('There are',len(mostCommonFeatures(softImputed_MBF_occurrence)),'important features')
    print(mostCommonFeatures(softImputed_MBF_occurrence))
print(softImputeMBFScore)

There are 13 important features
[ 14.  31.  33.  49.  68.  73.  97. 110.  81.  92. 116. 120. 134.]
There are 16 important features
[  4.   6.   7.  11.  19.  39.  52.  67.  70. 110.  81.  92. 120. 134.
 135. 140.]
There are 12 important features
[ 21.  22.  33.  39.  49.  67. 102. 110.  81.  92. 116. 120.]
There are 16 important features
[ 22.  25.  43.  49.  61.  70.  89.  96.  97. 101. 110.  81.  92. 116.
 120. 142.]
There are 12 important features
[  1.   5.   9.  22. 110.  79.  81.  92. 105. 116. 120. 130.]
There are 18 important features
[  3.   5.   6.   7.   8.  19.  39.  47.  49.  89.  97. 110.  81.  92.
 113. 116. 120. 134.]
There are 16 important features
[  2.   4.   5.   6.   7.   9.  39.  43.  53.  97. 110.  81.  92. 116.
 120. 130.]
There are 12 important features
[  7.   8.  22.  49.  97. 110.  81.  92. 116. 118. 120. 134.]
There are 14 important features
[ 21.  31.  39.  49.  70.  82.  90.  97. 110. 127.  81.  83.  92. 120.]
There are 11 important features
[  7.  33.  9

In [33]:
# random missing and mean impute
imputeScore=np.array([])
for i in range(10): 
    
    missing_random=compRandomFunc(X_hr_big_train,probability)
    imputed_random=imputetMissing(missing_random)
    
    #doing the predective performance thing
    bestModel.fit(imputed_random,y_hr_big_train)
    imputeScore=np.append(imputeScore,bestModel.score(X_hr_test,y_hr_test))
    
    #do the feature selection thing
    imputed_feature=importantFeatures(bestModel,imputed_random,y_hr_big_train,BS_size_big)
    imputed_featureIndex,imputed_occurrence=featureImportance(imputed_feature)
    print('There are',len(mostCommonFeatures(imputed_occurrence)),'important features')
    print(mostCommonFeatures(imputed_occurrence))
    
print(imputeScore)

There are 11 important features
[ 39.  43.  49.  70.  97. 110.  81.  92. 116. 120. 130.]
There are 10 important features
[ 19.  39.  43.  49.  97. 110.  81.  92. 116. 120.]
There are 9 important features
[ 19.  39.  49.  97. 110.  81.  92. 116. 120.]
There are 11 important features
[ 19.  39.  49.  97. 110.  81.  92. 106. 116. 120. 134.]
There are 9 important features
[ 39.  49.  97. 110.  81.  84.  92. 116. 120.]
There are 9 important features
[ 39.  49.  97. 110.  81.  92. 116. 120. 131.]
There are 9 important features
[ 39.  43.  49.  97. 110.  81.  92. 116. 120.]
There are 9 important features
[ 39.  49.  97. 110.  81.  92. 116. 120. 130.]
There are 9 important features
[ 39.  43.  49.  97. 110.  81.  92. 116. 120.]
There are 7 important features
[ 39.  97. 110.  81.  92. 116. 120.]
[0.30221332 0.29390071 0.28487832 0.28605767 0.29204145 0.29054609
 0.28945875 0.30015482 0.29657769 0.29846784]


In [34]:
MBF_imputeScore=np.array([])

for i in range(10):
    MBF_missing=missingByFeature(X_hr_big_train,p1,p2)
    MBF_imputed=imputetMissing(MBF_missing)
    
    #doing the predective performace thing
    bestModel.fit(MBF_imputed,y_hr_big_train)
    MBF_imputeScore=np.append(MBF_imputeScore,bestModel.score(X_hr_test,y_hr_test))
    
    #do the feature selection thing
    imputed_feature_MBF=importantFeatures(bestModel,MBF_imputed,y_hr_big_train,BS_size_big)
    imputed_MBF_Index,imputed_MBF_occurrence=featureImportance(imputed_feature_MBF)
    
    print('There are',len(mostCommonFeatures(imputed_MBF_occurrence)),'important features')
    print(mostCommonFeatures(imputed_occurrence))
print(MBF_imputeScore)

There are 12 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 12 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 13 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 12 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 11 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 11 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 11 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 13 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 8 important features
[ 39.  97. 110.  81.  92. 116. 120.]
There are 12 important features
[ 39.  97. 110.  81.  92. 116. 120.]
[0.27584298 0.28398105 0.28944321 0.28404941 0.27537315 0.2787337
 0.28339885 0.28706989 0.28912546 0.2873552 ]
