In [39]:
import numpy as np
from numpy import load
import random
from sklearn.linear_model import Ridge
from fancyimpute import SoftImpute, BiScaler
from sklearn.linear_model import LinearRegression as reg
from sklearn.linear_model import ElasticNetCV
import itertools
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
import warnings
import os, sys
warnings.filterwarnings('ignore')

data=np.load('august2020-exercise1.npz')

#print(data.files) # view the files in the npz

#function to normalize the data
def normalize(matrix):
    normed=(matrix-matrix.mean(axis=0))/matrix.std(axis=0)
    return normed

######## train data ########
X_lr_small_train=normalize(data['X_lr_small_train'])
X_lr_big_train=normalize(data['X_lr_big_train'])
X_hr_small_train=normalize(data['X_hr_small_train'])
X_hr_big_train=normalize(data['X_hr_big_train'])

y_lr_small_train=normalize(data['y_lr_small_train'])
y_lr_big_train=normalize(data['y_lr_big_train'])
y_hr_small_train=normalize(data['y_hr_small_train'])
y_hr_big_train=normalize(data['y_hr_big_train'])
######## test data#########
X_lr_test=normalize(data['X_lr_test'])
X_hr_test=normalize(data['X_hr_test'])

y_lr_test=normalize(data['y_lr_test'])
y_hr_test=normalize(data['y_hr_test'])

def compRandomFunc(array,probability): # probability=0.7
    n=array.shape[0]
    p=array.shape[1]
    array=array.astype("float")
    for i in range(n):    
        for j in range(p):
            r=random.uniform(0, 1)
            if r < probability:
                array[i,j]=np.NaN
    return array

def missingByFeature(array,p1,p2): #p1=0.875, p2=0.8
    n=array.shape[0]
    p=array.shape[1]
    array=array.astype("float")
    featureList=np.array([])
    for i in range(p):
        r = random.uniform(0, 1)
        if r<p1:
            featureList=np.append(featureList,i)
    for i in featureList:  # i is the counter for feature not sample size
        i=int(i)
        for j in range(n):
            r2=random.uniform(0,1)
            if r2<p2:
                array[j,i]=np.NaN
    return array

def meanImputation(feature,data):
    imputed=np.nanmean(data[:,feature])
    return imputed

# creating data with missing values
import math
probability=0.7
p1=0.875
p2=0.8
# return the imputed matrix with mean imputation
def imputetMissing(matrix):
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if math.isnan(matrix[i,j]) is True:
                matrix[i,j]=meanImputation(j,matrix)
    return matrix

def test(models, X_train,y_train,X_test,y_test, iterations = 100):
    results = {}
    for i in models:
        r2_train = []
        r2_test = []
        for j in range(iterations):
            r2_test.append(metrics.r2_score(y_test,
                                            models[i].fit(X_train, 
                                                         y_train).predict(X_test)))

            r2_train.append(metrics.r2_score(y_train, 
                                             models[i].fit(X_train, 
                                                          y_train).predict(X_train)))
        results[i] = [np.mean(r2_train),np.mean(r2_test)]
    return pd.DataFrame(results)

def importantFeatures(best_model,X_train,y_train,BS_size):
    importantFeatures=np.array([])
    
    for i in range(100):
        #creatig bootstrap boots
        x_bag=np.empty((0,X_train.shape[1]))
        y_bag=np.array([])
        for j in range(BS_size): 
            randIndex=np.random.randint(len(X_train), size=1)
            x_sample=X_train[randIndex,:]
            y_sample=y_train[randIndex]
                
            x_bag=np.concatenate((x_bag,x_sample))
            y_bag=np.append(y_bag,y_sample)             
        # test the estimator on bootstrap sample 100 times
        #skippa grid search och kör dirr på bästa hyperparametrarna för att spara tid
        bestEstimator=best_model.fit(x_bag,y_bag)
        
        rfeBest=RFE(bestEstimator)
        X_rfe=rfeBest.fit_transform(x_bag,y_bag)
        bestEstimator.fit(X_rfe,y_bag)
        
        importantFeatures=np.append(importantFeatures,np.where(rfeBest.ranking_==1))
        
    return importantFeatures

def featureImportance(featureList):
    unique_elements, counts_elements = np.unique(featureList, return_counts=True)
    return unique_elements, counts_elements

def mostCommonFeatures(occurrence):
    mostCommonIndex=np.array([])
    for i in range(len(occurrence)):
        if occurrence[i]>=95:
            mostCommonIndex=np.append(mostCommonIndex,[i])
    mostCommonIndex=np.array(mostCommonIndex,dtype=np.int8)
    mostCommonFeatures=featureIndex[mostCommonIndex]
    return mostCommonFeatures

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout



In [30]:
models = {'OLS': linear_model.LinearRegression(),
         'Lasso': linear_model.Lasso(),
         'Ridge': linear_model.Ridge(),}

lasso_params = {'alpha':[0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha':[200, 250, 300, 400, 500]}
eNet_params={'alpha': [0, 0.5, 0.1, 0.01, 0.001],
             'l1_ratio': [0, 0.25, 0.5, 0.75, 1]}

models1 = {
           'Lasso': GridSearchCV(linear_model.Lasso(), 
                               param_grid=lasso_params).fit(X_lr_small_train,y_lr_small_train).best_estimator_,
           'Ridge': GridSearchCV(linear_model.Ridge(), 
                               param_grid=ridge_params).fit(X_lr_small_train,y_lr_small_train).best_estimator_,
          'Elastic net':GridSearchCV(linear_model.ElasticNet(), 
                               param_grid=eNet_params).fit(X_lr_small_train,y_lr_small_train).best_estimator_,
         'OLS': linear_model.LinearRegression()}

print('low rate small data')
test(models1, X_lr_small_train,y_lr_small_train,X_lr_test,y_lr_test)

print(GridSearchCV(linear_model.ElasticNet(),param_grid=eNet_params).fit(X_lr_small_train,y_lr_small_train).best_estimator_)


low rate small data
ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.25,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


In [49]:
bestModel=ElasticNet(l1_ratio=0.25,alpha=0.1)
bestModel.fit(X_lr_small_train,y_lr_small_train)
print('Best model score:',bestModel.score(X_lr_test,y_lr_test))
#print('Best model coefficients:',bestModel.coef_)

Best model score: 0.3019142149657966


In [50]:
bestModel.coef_

array([ 0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.        ,  0.        ,  0.01892819, -0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
        0.        ,  0.        , -0.02549994, -0.        ,  0.        ,
       -0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.        ,
        0.        ,  0.        ,  0.        , -0.01354898,  0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.02263579,  0.        , -0.01259263,  0.        ,  0.  

In [44]:
BS_size_small= 100
feature_lr_small=importantFeatures(bestModel,X_lr_small_train,y_lr_small_train,BS_size_small)

In [48]:
mostCommonFeatures(occurrence)

array([33., 34., 35., 36., 37., 38., 39.])

In [46]:
featureIndex,occurrence=featureImportance(feature_lr_small)
len(mostCommonFeatures(occurrence))

7

In [40]:
#random missing and softImpute
softImpute = SoftImpute()
biscaler = BiScaler()
softImputeScore=np.array([])
scale=np.array([0.1,0.25,0.5,0.75])

for i in range(10):
    with HiddenPrints():
        X_randomMissingSoft=compRandomFunc(X_lr_small_train,probability) #here is the line where you enable missing data
        X_randomMissingSoft_normalized = biscaler.fit_transform(X_randomMissingSoft)
        
        lambda_0=np.nanmax(X_randomMissingSoft_normalized)
        bestOfLambdaScore=np.array([])
        for j in range(len(scale)):
            softImpute=SoftImpute(shrinkage_value=lambda_0*scale[j])
        
            X_imputed_soft_normalized = softImpute.fit_transform(X_randomMissingSoft_normalized)
            X_imputed_soft = biscaler.inverse_transform(X_imputed_soft_normalized)
            
            bestModel.fit(X_imputed_soft,y_lr_small_train)
            bestOfLambdaScore=np.append(bestOfLambdaScore,bestModel.score(X_lr_test,y_lr_test))
        
        bestLambdaIndex=np.where(bestOfLambdaScore==np.max(bestOfLambdaScore))
        softImpute=SoftImpute(shrinkage_value=lambda_0*scale[bestLambdaIndex])
        X_imputed_soft_normalized = softImpute.fit_transform(X_randomMissingSoft_normalized)
        X_imputed_soft = biscaler.inverse_transform(X_imputed_soft_normalized)
        
        softImputeScore=np.append(softImputeScore,np.max(bestOfLambdaScore))
        
   #do the feature selection thing
    softImpute_feature=importantFeatures(bestModel,X_imputed_soft,y_lr_small_train,BS_size_small)
    softImputed_featureIndex,softImputed_occurrence=featureImportance(softImpute_feature)
    print('There are',len(mostCommonFeatures(softImputed_occurrence)),'important features',flush=True)
    print(mostCommonFeatures(softImputed_occurrence),flush=True)
print(softImputeScore,flush=True)

There are 15 important features
[29. 30. 31. 32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44.]
There are 16 important features
[37. 39. 40. 41. 42. 43. 44. 48. 60. 61. 62. 63. 64. 65. 66. 68.]
There are 14 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45. 60. 65.]
There are 16 important features
[30. 31. 32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 60. 65.]
There are 7 important features
[37. 39. 40. 41. 42. 43. 47.]
There are 6 important features
[40. 41. 42. 43. 44. 65.]
There are 9 important features
[39. 40. 41. 42. 43. 44. 45. 47. 48.]
There are 18 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45. 47. 48. 60. 61. 62. 65.]
There are 4 important features
[43. 44. 45. 65.]
There are 14 important features
[29. 30. 31. 32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43.]
[0.30830819 0.30395516 0.30417196 0.30256809 0.3075016  0.30291221
 0.30661685 0.30036255 0.30442489 0.30593852]


In [41]:
#softImpute with missing by feature
softImputeMBFScore=np.array([])
for i in range(10):
    with HiddenPrints():
        X_MBF_soft=missingByFeature(X_lr_small_train,p1,p2)
        X_MBF_soft_normalized=biscaler.fit_transform(X_MBF_soft)
        
        lambda_0=np.nanmax(X_randomMissingSoft_normalized)
        bestOfLambdaScore=np.array([])
        for j in range(len(scale)):
            softImpute=SoftImpute(shrinkage_value=lambda_0*scale[j])
            
            X_MBF_imputed_soft_normalized=softImpute.fit_transform(X_MBF_soft_normalized)
            X_MBF_imputed_soft=biscaler.inverse_transform(X_MBF_imputed_soft_normalized)
            
            bestModel.fit(X_MBF_imputed_soft,y_lr_small_train)
            bestOfLambdaScore=np.append(bestOfLambdaScore,bestModel.score(X_lr_test,y_lr_test))
        
        bestLambdaIndex=np.where(bestOfLambdaScore==np.max(bestOfLambdaScore))
        softImpute=SoftImpute(shrinkage_value=lambda_0*scale[bestLambdaIndex])
        
        X_MBF_imputed_soft_normalized=softImpute.fit_transform(X_MBF_soft_normalized)
        X_MBF_imputed_soft=biscaler.inverse_transform(X_MBF_imputed_soft_normalized)
        
        softImputeMBFScore=np.append(softImputeMBFScore,np.max(bestOfLambdaScore))
    
    #do the feature selection thing
    softImpute_MBF_feature=importantFeatures(bestModel,X_MBF_imputed_soft,y_lr_small_train,BS_size_small)
    softImputed_MBF_featureIndex,softImputed_MBF_occurrence=featureImportance(softImpute_MBF_feature)
    
    print('There are',len(mostCommonFeatures(softImputed_MBF_occurrence)),'important features')
    print(mostCommonFeatures(softImputed_MBF_occurrence))
print(softImputeMBFScore)

There are 14 important features
[32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 60. 65.]
There are 13 important features
[31. 32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44.]
There are 12 important features
[ 34.  35.  36.  37.  39.  40.  41.  42.  43.  44. 144.  60.]
There are 12 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 70.]
There are 20 important features
[27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45.
 47. 70.]
There are 17 important features
[18. 19. 21. 22. 23. 24. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36.]
There are 14 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45. 65. 84.]
There are 13 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45. 47.]
There are 12 important features
[33. 34. 35. 36. 37. 39. 40. 41. 42. 43. 44. 45.]
There are 7 important features
[40. 41. 42. 43. 44. 47. 65.]
[0.30204085 0.30543181 0.30483037 0.30852851 0.3047836  0.30631347
 0.30670643 0.3067765  0.30893385 0.3100666 ]

In [42]:
# random missing and mean impute
imputeScore=np.array([])
for i in range(10): 
    
    missing_random=compRandomFunc(X_lr_small_train,probability)
    imputed_random=imputetMissing(missing_random)
    
    #doing the predective performance thing
    bestModel.fit(imputed_random,y_lr_small_train)
    imputeScore=np.append(imputeScore,bestModel.score(X_lr_test,y_lr_test))
    
    #do the feature selection thing
    imputed_feature=importantFeatures(bestModel,imputed_random,y_lr_small_train,BS_size_small)
    imputed_featureIndex,imputed_occurrence=featureImportance(imputed_feature)
    print('There are',len(mostCommonFeatures(imputed_occurrence)),'important features')
    print(mostCommonFeatures(imputed_occurrence))
    
print(imputeScore)

There are 11 important features
[11. 12. 13. 15. 16. 17. 18. 19. 21. 22. 23.]
There are 10 important features
[ 9. 11. 12. 13. 15. 16. 17. 18. 19. 21.]
There are 9 important features
[12. 13. 15. 16. 17. 18. 19. 21. 22.]
There are 11 important features
[ 9. 11. 12. 13. 15. 16. 17. 18. 19. 21. 22.]
There are 10 important features
[ 9. 11. 12. 13. 15. 16. 17. 18. 19. 21.]
There are 9 important features
[12. 13. 15. 16. 17. 18. 19. 21. 23.]
There are 11 important features
[ 8.  9. 11. 12. 13. 15. 16. 17. 18. 19. 21.]
There are 9 important features
[11. 12. 13. 15. 16. 17. 18. 19. 21.]
There are 12 important features
[ 8.  9. 11. 12. 13. 15. 16. 17. 18. 19. 21. 22.]
There are 10 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
[-0.07266854 -0.0692111  -0.04265043 -0.0690027   0.03324835 -0.1615104
 -0.03423722 -0.19126057 -0.02648263 -0.00697168]


In [43]:
MBF_imputeScore=np.array([])

for i in range(10):
    MBF_missing=missingByFeature(X_lr_small_train,p1,p2)
    MBF_imputed=imputetMissing(MBF_missing)
    
    #doing the predective performace thing
    bestModel.fit(MBF_imputed,y_lr_small_train)
    MBF_imputeScore=np.append(MBF_imputeScore,bestModel.score(X_lr_test,y_lr_test))
    
    #do the feature selection thing
    imputed_feature_MBF=importantFeatures(bestModel,MBF_imputed,y_lr_small_train,BS_size_small)
    imputed_MBF_Index,imputed_MBF_occurrence=featureImportance(imputed_feature_MBF)
    
    print('There are',len(mostCommonFeatures(imputed_MBF_occurrence)),'important features')
    print(mostCommonFeatures(imputed_occurrence))
print(MBF_imputeScore)

There are 14 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 13 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 15 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 12 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 14 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 16 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 12 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 11 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 15 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
There are 15 important features
[13. 15. 16. 17. 18. 19. 21. 22. 23. 27.]
[0.30413935 0.28745913 0.2972928  0.291745   0.27229778 0.29709856
 0.29682752 0.26692131 0.30170641 0.29271773]
