In [1]:
import os
import sys
import argparse
import logging
import shutil
from sklearn.decomposition import PCA
import pickle
import pandas as pd

# Redirect messages issued through the `warnings` module into the logging system.
logging.captureWarnings(True)

import matplotlib

# Select the backend ahead of the pyplot import below; on linux a
# non-interactive backend is forced.
if sys.platform == 'linux':
    print('Use non-interactive Agg backend for matplotlib on linux')
    matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np
import torch

# Add the parent directory to the import path before importing mdlearn
# (presumably that is where the package lives -- confirm for your checkout).
sys.path.append('..')
from mdlearn import fitting, visualize, metrics, preprocessing, validation, dataloader

In [2]:
class OPT:
    """Plain attribute holder used in place of a parsed command-line options object."""


# Settings for this run. Every value is a string; load_data() splits the
# comma-separated ones (`fp`, `featrm`) itself.
opt = OPT()
vars(opt).update(
    input='../data/C123-npt-1bar.txt',
    target='einter',
    fp='../run/C123/fp/fp_morgan1,../run/C123/fp/fp_simple',
    featrm='',
    part='../run/C123/out/part-1.txt',
    output='../run/C123/out',
    layer='32,16,8',
)

In [19]:
def load_data(opt):
    """Rebuild the scaled train/validation split and reload the saved model.

    Parameters
    ----------
    opt : object
        Options holder. Attributes read here:

        * ``input``  -- data file handed to ``dataloader.load``.
        * ``target`` -- target name handed to ``dataloader.load``.
        * ``fp``     -- comma-separated list, split and passed as ``fps``.
        * ``featrm`` -- ``'auto'`` (use the built-in index list), ``''``
          (remove nothing) or comma-separated integers.
        * ``part``   -- partition file to load; when empty a new partition is
          created and saved as ``part.txt`` under ``opt.output``.
        * ``output`` -- directory that receives ``scale.txt`` and must already
          contain ``model.pt``.

        ``opt.layer`` is not consulted: the regressor is constructed with an
        empty layer list and ``model.load`` is then called on the saved file.

    Returns
    -------
    tuple
        ``(normed_trainx, normed_validx, trainy, validy, model)`` where the
        two ``normed_*`` values are the training/validation inputs passed
        through a scaler fitted on the training inputs only.

    Raises
    ------
    ValueError
        If ``opt.featrm`` is an explicit list containing a non-integer entry.

    Notes
    -----
    Side effects: creates ``opt.output`` if missing, writes ``scale.txt`` into
    it, and writes ``part.txt`` when no partition file is supplied.
    """
    # Local logger: the notebook never defines a module-level `logger`, so the
    # 'auto' branch below used to fail with NameError.
    logger = logging.getLogger(__name__)

    # makedirs creates missing parent directories too and does not fail when
    # the directory already exists.
    os.makedirs(opt.output, exist_ok=True)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = [int(index) for index in opt.featrm.split(',')]

    datax, datay, data_names = dataloader.load(filename=opt.input, target=opt.target,
                                               fps=opt.fp.split(','), featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        # presumably 80% training / 10% validation fractions -- confirm
        # against preprocessing.Selector.partition
        selector.partition(0.8, 0.1)
        selector.save(os.path.join(opt.output, 'part.txt'))

    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    # Fit the scaler on the training inputs only, then apply it to both sets.
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(os.path.join(opt.output, 'scale.txt'))
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    model = fitting.TorchMLPRegressor(None, None, [], is_gpu=False)
    model.load(os.path.join(opt.output, 'model.pt'))
    return normed_trainx, normed_validx, trainy, validy, model

In [24]:
# Build the scaled train/validation inputs and reload the saved model.
# Note: load_data also writes scale.txt into opt.output as a side effect.
normed_trainx, normed_validx, trainy, validy, model = load_data(opt)

In [26]:
from sklearn import datasets
from sklearn.feature_selection import RFECV
from sklearn.ensemble import AdaBoostRegressor

# Base regressor whose feature importances drive the elimination.
# It gets its own name so the model returned by load_data() stays bound to
# `model`, and a fixed seed so the ranking is reproducible between runs.
rfe_estimator = AdaBoostRegressor(random_state=0)

# Recursive feature elimination with cross-validation: the number of features
# to keep is chosen by the cross-validated score, not fixed in advance.
rfe = RFECV(rfe_estimator, verbose=1)
rfe = rfe.fit(normed_trainx, trainy)

# Summarize the selection: boolean mask of kept features, then the rank of
# every feature (selected features have rank 1).
print(rfe.support_)
print(rfe.ranking_)

Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 fe

In [16]:
# Sanity check: dimensions of the scaled training inputs.
normed_trainx.shape

(15949, 43)

In [17]:
# Sanity check: dimensions of the scaled validation inputs.
normed_validx.shape

(3987, 43)

In [29]:
# One integer per feature column (43 in total), hard-coded.
# NOTE(review): presumably copied by hand from a printed `rfe.ranking_` --
# confirm, and prefer reading the attribute directly.
a = np.array([
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 2, 4, 7, 9, 1, 12, 14, 1,
    1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
    1, 1, 6, 8, 5, 10, 11, 13, 1, 1,
    1, 1, 1,
])

In [36]:
# Keep only the columns whose entry in `a` is below 4 and report the
# resulting shape (the threshold 4 is a hand-picked cut-off).
normed_trainx[:,a < 4].shape

(15949, 32)