In [2]:
import copy
import glob
import math
import os
import shutil
import subprocess
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial.distance
import shap
from scipy.stats import pearsonr
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import BorutaShap
from boruta import BorutaPy

# Colormap name kept as a notebook-wide setting (not referenced in the cells shown here).
cmap='viridis'
# Put the parent directory on the module search path so the project-local
# packages imported below can be found.
sys.path.insert(1, '../')
# NOTE(review): star imports hide where names come from. Helpers that are used
# later without being defined in the visible cells (e.g. `scale_data`)
# presumably arrive through one of these three imports -- confirm.
from LibHelperFuncs import *

from iterative_spectral_method.src import *
from iterative_spectral_method.sdr import *

# Seed handed to the estimators that set `random_state` explicitly.
rstate = 0
# Seeds NumPy's global RNG (consumed by np.random.shuffle and by any estimator
# left with its default random_state).
np.random.seed(0)
# Worker count passed to cross_val_score.
njob = 20
# Number of trees for the forest used inside SelectFromModel.
global_trees = 500
# Maximum tree depth for the forests used by the Boruta and SelectFromModel selectors.
mdepth = 10

# Fraction of the total SHAP importance to retain: `_ur` is used by the selector
# working on the raw features, `_r` by the selector working on PCA-reduced features.
p_thres_ur = 0.8
p_thres_r = 0.8 

# Turn grid lines off for every matplotlib axes by default.
plt.rcParams['axes.grid'] = False

ModuleNotFoundError: No module named 'boruta'

In [None]:
def compute_carried_shap(s_vals, comps, X):
    """Carry per-sample SHAP values from component space back to feature space.

    Every one of the first ``X.shape[0]`` rows of ``s_vals`` is multiplied by
    ``comps``. This is done as a single matrix product rather than one Python
    call per row, which also makes an input with zero rows valid.

    Parameters
    ----------
    s_vals : array-like, 2-D
        SHAP values, one row per sample and one column per component.
    comps : array-like, 2-D
        Component matrix with one row per component; its column count must
        equal ``X.shape[1]``.
    X : array-like with a ``shape`` attribute
        Only ``X.shape`` is read: the row count selects how many rows of
        ``s_vals`` are used, the column count fixes the output width.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], X.shape[1])``.

    Raises
    ------
    IndexError
        If ``s_vals`` has fewer rows than ``X``.
    """
    n_samples, n_features = X.shape[0], X.shape[1]
    shap_matrix = np.asarray(s_vals)
    if shap_matrix.shape[0] < n_samples:
        # The row-by-row version failed the same way when it ran out of rows.
        raise IndexError(
            "s_vals has %d rows but X has %d" % (shap_matrix.shape[0], n_samples)
        )
    carried = shap_matrix[:n_samples] @ np.asarray(comps)
    return carried.reshape(-1, n_features)

def mean_carried_shap(s_vals, comps, X):
    """Mean absolute carried SHAP value per original feature, scaled by loading size.

    The carried SHAP values from ``compute_carried_shap`` are divided, column by
    column, by the squared root-mean-square of that column of ``comps``; columns
    whose scale is zero or below ``1e-8`` are divided by 1 instead. The result is
    the mean over samples of the absolute scaled values.
    """
    # Root-mean-square of each column of the component matrix.
    rms_loading = np.sqrt(np.mean(comps ** 2, axis=0))
    rms_loading[rms_loading == 0] = 1

    # Square it again; near-zero scales are replaced by 1 so the division stays finite.
    scale = rms_loading ** 2
    scale[scale < 1e-8] = 1

    scaled = compute_carried_shap(s_vals, comps, X) / scale
    return np.mean(np.abs(scaled), axis=0)


In [3]:
## Generate dataset and name of file here

# Base name and relative path for this run (neither is used again in the cells shown here).
fname = "Graphene_1"
filename = str('dmp/' + fname)

# Load the preprocessed table as a plain NumPy array, plus a companion CSV as a
# DataFrame that is used below only for its column labels.
originalDat = np.genfromtxt("../../Data/data_bas_preprocessing.csv", delimiter=',')
originalDat_pd = pd.read_csv("../../Data/data_bas_preprocessing_pd.csv")

# Features are every column except the last. Transposing, shuffling and
# transposing back permutes the feature *columns*, because np.random.shuffle
# reorders along the first axis in place.
# NOTE(review): the transposed slice is a view, so this also reorders those
# columns inside `originalDat`, and the labels in `C` are not permuted to
# match -- confirm this is intended.
X = originalDat[:,:-1].T
np.random.shuffle(X)
X = X.T
# np.random.shuffle(X)
# Target is the last column.
Y = originalDat[:,-1]
C = originalDat_pd.columns
# NOTE(review): X and Y are reassigned from a different file in the next cell,
# so the values built here are overwritten before any model is evaluated.


In [4]:
# Read the nanoflake table and discard every column that contains a missing value.
raw_dat = pd.read_csv("../../Data/Graphene_Oxide_Nanoflake.csv")
dat = raw_dat.dropna(axis='columns')

# Features: columns 1 .. -3 of the filtered table without 'Shape', passed through scale_data.
X = scale_data(dat.iloc[:, 1:-2].drop(columns=['Shape'])).values
# Target: last column of the unfiltered table.
Y = raw_dat.iloc[:, -1].values

# Append the target as a trailing column and write the combined matrix to disk.
XY = np.concatenate((X, Y[..., np.newaxis]), axis=1)
print(XY.shape)
np.savetxt("data_base.csv", XY, delimiter=',')

(776, 673)


In [5]:
def get_index_percentile(feature_importances, p):
    """Count how many of the largest importances reach a fraction ``p`` of the total.

    The importances are sorted in descending order and accumulated; the result
    is the smallest count ``k >= 1`` whose running sum is at least
    ``sum(feature_importances) * p``.

    Parameters
    ----------
    feature_importances : array-like, 1-D
        Importance score per feature.
    p : float
        Fraction of the total importance to cover.

    Returns
    -------
    int
        Number of top-ranked features to keep. If the threshold is never
        reached (``p > 1``, or floating-point rounding at ``p == 1``) the total
        number of features is returned instead of looping forever. Empty input
        yields 1, as the loop-based version did.
    """
    values = np.asarray(feature_importances)
    if values.size == 0:
        return 1

    target = np.sum(values) * p
    # One cumulative sum replaces re-summing the prefix on every iteration.
    running_total = np.cumsum(np.sort(values)[::-1])
    reached = np.flatnonzero(running_total >= target)
    if reached.size == 0:
        # Threshold unreachable: keep everything.
        return int(values.size)
    return int(reached[0]) + 1

In [6]:
def boruta_get_X(X, Y):
    """Reduce ``X`` to the features kept by Boruta.

    A random-forest regressor (depth limited by the notebook-level ``mdepth``)
    is wrapped in ``BorutaPy``, fitted on ``(X, Y)``, and then used to
    transform ``X``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one column per feature.
    Y : array-like
        Regression target.

    Returns
    -------
    The result of ``BorutaPy.transform(X)``, i.e. ``X`` filtered down to the
    selected features.
    """
    forest = RandomForestRegressor(n_jobs=-1, max_depth=mdepth)

    # verbose=2 prints the confirmed / tentative / rejected counts every iteration.
    feat_selector = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=1)
    feat_selector.fit(X, Y)

    # After fitting, `feat_selector.support_` and `feat_selector.ranking_` can be
    # inspected; they are not needed to build the filtered matrix.
    return feat_selector.transform(X)

In [7]:
def feature_importance_get_X(X, Y):
    """Reduce ``X`` using scikit-learn's ``SelectFromModel`` on a random forest.

    The forest is configured from the notebook-level ``global_trees``,
    ``mdepth`` and ``rstate`` settings; the selector is fitted on ``(X, Y)``
    and then applied to ``X``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one column per feature.
    Y : array-like
        Regression target.

    Returns
    -------
    The result of ``SelectFromModel.transform(X)``.
    """
    forest = RandomForestRegressor(
        n_estimators=global_trees,
        n_jobs=-1,
        max_depth=mdepth,
        random_state=rstate,
    )
    selector = SelectFromModel(estimator=forest).fit(X, Y)
    return selector.transform(X)

In [8]:
def shap_importance_get_X(X, Y):
    """Keep the features that carry the top share of mean absolute SHAP value.

    A random forest is fitted on ``(X, Y)``, approximate tree SHAP values are
    computed for every sample, and features are ranked by mean absolute SHAP
    value. The highest-ranked features that together reach the notebook-level
    fraction ``p_thres_ur`` of the total are kept.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix, one column per feature.
    Y : array-like
        Regression target.

    Returns
    -------
    numpy.ndarray
        The selected columns of ``X``, ordered from most to least important.
    """
    # Seeded like the other forests in this notebook, so the selected set does
    # not depend on how much of the global NumPy RNG earlier cells consumed.
    forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=rstate)
    forest.fit(X, Y)

    explainer = shap.TreeExplainer(forest)
    s_vals = explainer.shap_values(X, approximate=True)
    feature_importances = np.mean(np.abs(s_vals), axis=0)

    f_to_keep = get_index_percentile(feature_importances, p_thres_ur)
    ranked_columns = np.argsort(feature_importances)[::-1]

    return X[:, ranked_columns[:f_to_keep]]

In [9]:
def shap_importance_reduced_get_X(X, Y):
    """Select features by SHAP importance computed in a PCA-reduced space.

    ``X`` is projected onto at most 50 principal components, a random forest is
    fitted on the projection, and approximate tree SHAP values are computed for
    the components. ``mean_carried_shap`` maps them back to per-feature scores
    through the PCA component matrix. The highest-scoring features that together
    reach the notebook-level fraction ``p_thres_r`` of the total are kept.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix, one column per feature.
    Y : array-like
        Regression target.

    Returns
    -------
    numpy.ndarray
        The selected columns of ``X``, ordered from most to least important.
    """
    # PCA cannot extract more components than min(n_samples, n_features), so the
    # fixed 50 is capped for small inputs.
    n_components = min(50, X.shape[0], X.shape[1])
    # Seeding keeps the projection reproducible should PCA choose a randomized solver.
    pca = PCA(n_components=n_components, random_state=rstate)
    Xr = pca.fit_transform(X)
    comps = pca.components_

    # Seeded like the other forests in this notebook.
    forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=rstate)
    forest.fit(Xr, Y)

    explainer = shap.TreeExplainer(forest)
    s_vals = explainer.shap_values(Xr, approximate=True)
    feature_importances = mean_carried_shap(s_vals, comps, X)

    f_to_keep = get_index_percentile(feature_importances, p_thres_r)
    ranked_columns = np.argsort(feature_importances)[::-1]

    return X[:, ranked_columns[:f_to_keep]]

In [10]:
def original_model(X, Y):
    """Baseline "selector" that keeps every feature.

    Takes the same ``(X, Y)`` arguments as the other selection functions so it
    can be passed wherever they are; ``Y`` is ignored and ``X`` is handed back
    as the very same object.
    """
    untouched = X
    return untouched

In [11]:
def evaluate_model(func, X, Y):
    """Time a feature-selection function and score a forest on its output.

    Parameters
    ----------
    func : callable
        Called as ``func(X, Y)``; must return the reduced 2-D feature matrix.
    X : array-like
        Full feature matrix.
    Y : array-like
        Regression target.

    Returns
    -------
    dict
        Keys, in insertion order:

        * ``'Time'`` - seconds spent inside ``func``.
        * ``'RMSE'`` - root-mean-square error of the forest on the data it was
          fitted on (in-sample, not a held-out estimate).
        * ``'CV'`` - mean of the absolute 5-fold ``neg_mean_squared_error``
          scores, i.e. a mean squared error; it is not rooted, so it is not on
          the same scale as ``'RMSE'``.
        * ``'No. Features'`` - number of columns returned by ``func``.

    Notes
    -----
    ``func`` sees all of ``X`` and ``Y`` once, before cross-validation, so the
    selection step is not repeated inside each fold.
    """
    scores = dict()

    # perf_counter is monotonic and high resolution, so the measured interval
    # cannot be skewed by system clock adjustments.
    started = time.perf_counter()
    Xr = func(X, Y)
    scores['Time'] = time.perf_counter() - started

    rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=rstate)
    rf.fit(Xr, Y)

    residuals = rf.predict(Xr) - Y
    scores['RMSE'] = np.sqrt(np.mean(residuals ** 2))

    cvscore = cross_val_score(rf, Xr, Y, cv=5, n_jobs=njob, scoring='neg_mean_squared_error')
    scores['CV'] = np.mean(np.abs(cvscore))

    scores['No. Features'] = Xr.shape[1]

    return scores

In [12]:
# Score each feature-selection strategy on the same (X, Y). The calls stay
# separate so results already computed survive if a later selector fails.
a = evaluate_model(func=shap_importance_get_X, X=X, Y=Y)                # SHAP on raw features
a2 = evaluate_model(func=shap_importance_reduced_get_X, X=X, Y=Y)       # SHAP carried back from PCA space
b = evaluate_model(func=feature_importance_get_X, X=X, Y=Y)             # SelectFromModel
c = evaluate_model(func=boruta_get_X, X=X, Y=Y)                         # Boruta

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	672
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	16
Rejected: 	656
Iteration: 	9 / 100
Confirmed: 	6
Tentative: 	10
Rejected: 	656
Iteration: 	10 / 100
Confirmed: 	6
Tentative: 	10
Rejected: 	656
Iteration: 	11 / 100
Confirmed: 	6
Tentative: 	10
Rejected: 	656
Iteration: 	12 / 100
Confirmed: 	7
Tentative: 	7
Rejected: 	658
Iteration: 	13 / 100
Confirmed: 	7
Tentative: 	7
Rejected: 	658
Iteration: 	14 / 100
Confirmed: 	7
Tentative: 	7
Rejected: 	658
Iteration: 	15 / 100
Confirmed: 	7
Tentative: 	7
Rejected: 	658
Iteration: 	16 / 100
Confirmed: 	7
Tentative:

In [13]:
# Baseline: same evaluation with no feature selection (all columns kept).
z = evaluate_model(func=original_model, X=X, Y=Y)

In [14]:
# Comparison table: one column per selection strategy, one row per metric.
# Row labels come from the keys of the first result dict.
df = pd.DataFrame(
    {
        'Shap': list(a.values()),
        'Shap Reduced': list(a2.values()),
        'Feature Importance': list(b.values()),
        'Boruta': list(c.values()),
        'original': list(z.values()),
    },
    index=list(a.keys()),
)

In [15]:
# Display the comparison table (rows: metrics, columns: selection strategies).
df

Unnamed: 0,Shap,Shap Reduced,Feature Importance,Boruta,original
Time,10.873082,2.183274,4.542866,71.551598,0.0
RMSE,0.020036,0.019967,0.019995,0.01988,0.020234
CV,0.003028,0.003007,0.00303,0.002987,0.003086
No. Features,190.0,31.0,176.0,8.0,672.0


In [18]:
# NOTE(review): ad-hoc ratio of two hand-typed numbers whose origin is not shown
# in this notebook -- derive them from named results or remove this cell.
8.9 / 4.2

2.119047619047619