In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
import copy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import SparsePCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestRegressor
import subprocess
from sklearn.model_selection import RandomizedSearchCV
import sys
import math
import scipy.spatial.distance
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.tree import DecisionTreeRegressor
import os
import shutil
import glob
import shap


cmap='viridis'
sys.path.insert(1, '../')
from LibHelperFuncs import *

from iterative_spectral_method.src import *
from iterative_spectral_method.sdr import *

rstate = 0
np.random.seed(44164)
njob = 22

global_components = 50
global_trees = 100

steps = np.linspace(0.05, 1, 20)

In [2]:
originalDat = np.genfromtxt("../../Data/data_bas_preprocessing.csv", delimiter=',')
originalDat_pd = pd.read_csv("../../Data/data_bas_preprocessing_pd.csv")

X = originalDat[:,:-1].T
np.random.shuffle(X)
X = X.T
# np.random.shuffle(X)
Y = originalDat[:,-1]
C = originalDat_pd.columns

lower_thres = 1e-3

files = []
objectives = ['ISM_gaus_homogeneous_0.5.csv', 'pca_sparse_0.5.csv',\
              "ISM_poly_homogeneous_0.5.csv", 'nmf_H_matrix_homogeneous_0.5.csv', 'pca_homogeneous_0.5.csv']

In [3]:
rf_params = dict()
rf_params['n_estimators'] = global_trees

rf = RandomForestRegressor(**rf_params, n_jobs=njob, random_state=rstate)
rf.fit(X, Y)

print("L1 loss:", np.mean(np.abs((rf.predict(X) - Y))))
print("L2 loss:", np.sqrt(np.mean((rf.predict(X) - Y) ** 2)))
print("L3 loss:", np.cbrt(np.mean((rf.predict(X) - Y) ** 3)))
print("L infinity loss:", np.max(np.abs(rf.predict(X) - Y)))

score = cross_val_score(rf, X, Y, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
print("5 Fold CV L2 loss:", np.abs(np.mean(score)))

L1 loss: 0.0093842319109461
L2 loss: 0.013552134081123516
L3 loss: 0.008738410330718663
L infinity loss: 0.10654500000000588
5 Fold CV L2 loss: 0.0012638723009328584


In [4]:
removed_features = []

while (True):
    
    X_t = X
    for rmfeatures in removed_features:
        X_t = np.delete(X_t, rmfeatures, axis=1)

    rf = RandomForestRegressor(**rf_params, n_jobs=njob, random_state=rstate)
    rf.fit(X_t, Y)

    explainerp = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependant')
    s_vals = explainerp.shap_values(X_t, approximate=True)

    ov_shap = np.mean(np.abs(s_vals), axis=0)
    removeable_features = np.where(ov_shap <= 1e-8)[0]
    
    if (len(removeable_features) == 0):
        break
    
    removed_features.append(removeable_features)
    
    print(removeable_features, X_t.shape)
    
X_t = X
for rmfeatures in removed_features:
    X_t = np.delete(X_t, rmfeatures, axis=1)

[  2   5  11  12  19  26  30  31  37  40  41  43  44  51  52  54  55  59
  60  63  70  72  73  77  78  87  89  93  99 105 107 113 116 117 118 127
 129 134 135 136 142 145 148 149 150 152 153 156 157 158 161 163 167 174
 180 181 184 185 187 188 190 191 192 196 202 205 210 214 216 221 223 224
 225 228 236 237 239 241 242 243 249 256 258 265 267 271 273 283 288 289
 290 303 305 307 310 313 316 319 320 321 325 326 330 345 346 347 348 352
 357 363 365 367 368 370 379] (1617, 390)
[ 36  62 187 250 274] (1617, 275)
[80] (1617, 270)
[58] (1617, 269)
[138] (1617, 268)


In [5]:
rf = RandomForestRegressor(n_estimators=global_trees, n_jobs=njob, random_state=rstate) 
# rf = LinearRegression()
rf.fit(X_t, Y)
print("L2:", np.sqrt(np.mean((rf.predict(X_t) - Y) ** 2)))
score = cross_val_score(rf, X_t, Y, cv=5, n_jobs=njob, scoring='neg_mean_squared_error')
print("5 Fold CV L2 loss:", np.abs(np.mean(score)))

L2: 0.013544791334037624
5 Fold CV L2 loss: 0.00126461949529529


In [None]:
most_list = []
stepcount = 150

sc = 10

X_l = X_t

most_important_features = []

for x in range(stepcount):
    
    rf = RandomForestRegressor(**rf_params, n_jobs=njob, random_state=rstate)
    rf.fit(X_l, Y)

    explainerp = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependant')
    s_vals = explainerp.shap_values(X_l, approximate=True)
    
    ov_shap_vals = np.mean(np.abs(s_vals), axis=0)
    
    most_important_features = np.argsort(ov_shap_vals)[::-1][:sc]
    
    most_list.extend(list(most_important_features))
    
    for lllist in most_important_features:
        X_l = np.delete(X_l, lllist, axis=1)
        
    print(most_important_features)

In [24]:
X_l.shape

(1617, 162)

In [25]:
most_important_features

array([ 59, 151, 149, 100, 153, 165,  85, 120, 107,  51], dtype=int64)

In [27]:
s_vals.shape

(1617, 167)

In [22]:
X_l = np.delete(X_l, 0, axis=0)

In [None]:
X_k = []
cvscoreslist = []
l2scoreslist= []

X_l = X_t


counter = 0
for i in most_list:
    counter+=1
    if (counter % 10 == 0):
        print(counter)
    
    X_k.append(X_l[:,i])
    X_l = np.delete(X_l, i, axis=1)

    X_testing = np.array(X_k).T
    
    rf = RandomForestRegressor(n_estimators=global_trees, n_jobs=njob, random_state=rstate) 
    rf.fit(X_testing, Y)
    l2score = np.sqrt(np.mean((rf.predict(X_testing) - Y) ** 2))
    cvscore = cross_val_score(rf, X_testing, Y, cv=5, n_jobs=njob, scoring='neg_mean_squared_error')
    cvscore = np.abs(np.mean(cvscore))
    
    cvscoreslist.append(cvscore)
    l2scoreslist.append(l2score)

In [None]:
plt.figure(figsize=(14, 10))

num_removed = np.arange(sc, stepcount * sc + 1, sc)
plt.plot(num_removed, cvscoreslist / np.max(cvscoreslist), label = 'CV Score')
plt.plot(num_removed, l2scoreslist / np.max(l2scoreslist), label = 'L2 Score')

plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.legend(fontsize=24)

plt.ylabel("Normalised Score", fontsize=24)
plt.xlabel("Number of features", fontsize=24)

plt.legend(fontsize=24)

plt.savefig("../../Writeups/Figures/CSA_forwards_normalised_k5.pdf", bbox_inches='tight')

In [None]:
np.min(cvscoreslist)

In [None]:
np.argsort(l2scoreslist)

In [None]:
np.argsort(cvscoreslist)

In [None]:
X_k = []

X_l = X_t
counter = 0
for i in most_list:
    
    X_k.append(X_l[:,i])
    X_l = np.delete(X_l, i, axis=1)

    X_testing = np.array(X_k).T
    if counter == np.argmin(cvscoreslist):
        break
        
    counter+=1

In [None]:

X_k = []

X_l = X_t
counter = 0
for i in most_list:
    
    X_k.append(X_l[:,i])
    X_l = np.delete(X_l, i, axis=1)

    X_testing = np.array(X_k).T
    if counter == 6:
        break
        
    counter+=1

In [None]:
rf = RandomForestRegressor(n_estimators=global_trees, n_jobs=njob, random_state=rstate) 
# rf = LinearRegression()
rf.fit(X_testing, Y)
print("L2:", np.sqrt(np.mean((rf.predict(X_testing) - Y) ** 2)))
score = cross_val_score(rf, X_testing, Y, cv=5, n_jobs=njob, scoring='neg_mean_squared_error')
print("5 Fold CV L2 loss:", np.abs(np.mean(score)))

In [None]:
rf = RandomForestRegressor(n_estimators=5, random_state=rstate, n_jobs=-1)
rf.fit(X, Y)

exp = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependant')
s_vals = exp.shap_values(X, approximate=True)
ov_shap = np.mean(np.abs(s_vals), axis=0)

X_v = X[:,np.argsort(ov_shap)[::-1][:X_testing.shape[1]]]

rf = RandomForestRegressor(n_estimators=global_trees, n_jobs=-1, random_state=rstate)

rf.fit(X_v, Y)
print("L2 loss:", np.sqrt(np.mean((rf.predict(X_v) - Y) ** 2)))
score = cross_val_score(rf, X_v, Y, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
print("5 Fold CV L2 loss:", np.abs(np.mean(score)))