In [2]:
import os
import sys
import argparse
import logging
from sklearn.decomposition import PCA

logging.captureWarnings(True)

import matplotlib.pyplot as plt
import numpy as np
import torch

sys.path.append('..')
from mdlearn import fitting, visualize, metrics, preprocessing, validation, dataloader
from scripts.train_pca import pca_nd

In [27]:
from matplotlib import pyplot as plt

In [3]:
def load_data(opt):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = dataloader.load(filename=opt.input, target=opt.target, fps=opt.fp.split(','), featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
        
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    model = fitting.TorchMLPRegressor(None, None, [],
                                      is_gpu= False,
                                      )
    model.load(opt.output + '/model.pt')
    if  opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(normed_trainx, normed_validx, len(normed_trainx[0]) - opt.pca)
    return normed_validx, validy, model

In [4]:
# parameters
class temp():
    def __init__(self):
        pass
    
opt = temp()
opt.layer = "32, 16, 8"
opt.output = 'run/CH-cp/out'
opt.featrm = ''
opt.input = 'data/result-ML-CH-cp.txt'
opt.target = 'cp'
opt.fp = 'run/CH-cp/fp/fp_morgan1,run/CH-cp/fp/fp_simple'
opt.part = 'run/CH-cp/out/part-1.txt'
opt.pca = -1

In [5]:
validx, validy, model = load_data(opt)

# sobel analysis

In [9]:
import pickle
import numpy as np
from SALib.sample import saltelli
from SALib.analyze import sobol

def sobol_analyze(model, X, N=5000, **kwargs):
    ''' INPUT:
            ::model::  the NN that takes X as input,
            ::X:: a 2D vector of input [batch_size, feature_length] 
            ::N:: the analyze point size, see more detail in SALib
            ::**kwargs:: specify optional kwargs for 'sobel.analyze(**kwargs)'
        RETURN:  
            a dict of sobol importance analysis
        '''
    upper = X.max(axis=0)
    lower = X.min(axis=0)
    bounds = [[i, j] for i,j in zip(lower, upper)] 
    all_feature_problem = {
        'num_vars': X.shape[1],
        'names': ['x'+str(i) for i in range(X.shape[1])],
        'bounds': bounds
    }
    print("Generate feature samples")
    params_value = saltelli.sample(all_feature_problem, N)
    y_est = model.predict_batch(params_value).flatten()
    print("Start analyze")
    result = sobol.analyze( all_feature_problem, y_est, **kwargs )

    return result

In [14]:
import pickle
# from scripts.sobol_analyze import sobol_analyze
import sys
sys.path.append('..')
from mdlearn import fitting

# load data and model
result = sobol_analyze(model, validx[:,:-2], 1000)
print(result['S1'])

Generate feature samples


RuntimeError: size mismatch, m1: [246000 x 122], m2: [124 x 32] at c:\a\w\1\s\tmp_conda_3.6_091443\conda\conda-bld\pytorch_1544087948354\work\aten\src\th\generic/THTensorMath.cpp:940

In [None]:
len(result['S1'])

In [23]:
sobel_idx = np.argsort(result['S1'][:-2])

In [25]:
result['S1'][sobel_idx[-10:]]

array([0.02550286, 0.02661847, 0.02773388, 0.02936743, 0.02958369,
       0.03498374, 0.03671077, 0.04239588, 0.04514189, 0.11359062])

In [26]:
sobel_idx

array([ 96,  42,  57,  93,  16,  80,  84,  22,  44,   3,  30,   8,  45,
         9, 121,   2,   5, 110, 107,  27,  72,  89,  78,  81,  58,  43,
        36,  60,  90,  55, 119, 115,  48, 118,  14, 114,  41,  25, 120,
        71,  35, 111,  23,  99,  50,  20,  53,  12,   1,  73,  61,  28,
        65,  15,  94,  77,  87,   6, 101,  74,  64,   0, 117,  56,  17,
        97,  47,  46,  37,  98,  26, 106,  21,  59,  69,  62,  33,  38,
       108, 100, 102,  95,  70, 104,  49,  10, 105,  11,  86,  34,  82,
        13,  88,  63,   4,  19,  40,  85, 103, 109, 112,  75,  29,  39,
        83,  54,  91,  76,  24,  52,  51,  66, 113,  18,  79,   7, 116,
        32,  67,  92,  68,  31], dtype=int64)

In [22]:
result['S1'][:-2]

array([ 2.47133811e-03,  1.19037823e-03, -5.15625540e-04, -8.77526727e-04,
        9.80286408e-03, -4.78849535e-04,  1.96234446e-03,  2.93674314e-02,
       -6.59232209e-04, -5.78205036e-04,  6.61478472e-03,  7.78542624e-03,
        1.17918279e-03,  8.40572093e-03,  3.76514920e-04,  1.74594321e-03,
       -2.26684037e-03,  2.84866346e-03,  2.66184734e-02,  9.93974400e-03,
        1.06605667e-03,  3.81377299e-03, -1.42386867e-03,  8.10246510e-04,
        1.76166662e-02,  5.40939951e-04,  3.43309795e-03, -2.69739507e-04,
        1.55028842e-03,  1.24712158e-02, -6.95968651e-04,  1.13590624e-01,
        3.49837444e-02,  4.84222951e-03,  8.26323476e-03,  6.05888715e-04,
        4.27158539e-05,  3.38814299e-03,  5.10502436e-03,  1.34158522e-02,
        1.00674914e-02,  4.68961733e-04, -3.62891703e-03,  4.04273749e-05,
       -1.13015689e-03, -6.30520965e-04,  3.16721013e-03,  2.90068764e-03,
        2.88381528e-04,  6.48894882e-03,  9.55515245e-04,  2.43223525e-02,
        2.19923889e-02,  

In [None]:
def sobol_reduce(X, X_valid, n)
    X_, X_valid_ = X[:,:-2], X_valid[:,:-2]
    X_, X_valid_ = X_[sobol_idx[-n:]], X_valid_[sobol_idx[-n:]]
    X = np.c_[X_, X[:,-2:]]
    X_valid = np.c_[X_valid_, X_valid[:,-2:]]
    return X, X_valid