In [1]:
import os
import sys
import argparse
import logging
import shutil
from sklearn.decomposition import PCA
import pickle
import pandas as pd

logging.captureWarnings(True)

import matplotlib

if sys.platform == 'linux':
    print('Use non-interactive Agg backend for matplotlib on linux')
    matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np
import torch

sys.path.append('..')
from mdlearn import fitting, visualize, metrics, preprocessing, validation, dataloader

In [2]:
class OPT():
    def __init__(self):
        pass
    
opt = OPT()
opt.input = '../data/result-ML-CH-cp.txt'
opt.target = 'cp'
opt.fp = '../run/CH-cp/fp/fp_morgan1,../run/CH-cp/fp/fp_simple'
opt.featrm = ''
opt.part = '../run/CH-cp/out/part-1.txt'
opt.output = '../run/CH-cp/out'
opt.layer = '32,16,8'

In [5]:
def load_data(opt):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = dataloader.load(filename=opt.input, target=opt.target, fps=opt.fp.split(','), featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
        
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    
    #scaler = preprocessing.Scaler()
    #scaler.fit(trainx)
    #scaler.save(opt.output + '/scale.txt')
    #normed_trainx = scaler.transform(trainx)
    #normed_validx = scaler.transform(validx)
    #model = fitting.TorchMLPRegressor(None, None, [],
    #                                  is_gpu= False,
    #                                  )
    #model.load(opt.output + '/model.pt')
    return trainx, validx, trainy, validy

In [7]:
trainx, validx, trainy, validy= load_data(opt)

In [8]:
selection = 28
[unique_trainx, idx] = np.unique(trainx[:,:-2],axis=0,return_index=True)
appear_count = (unique_trainx != 0).sum(axis=0)
appear_sort = np.argsort(appear_count)
trainx = trainx[:, appear_sort[-selection: ]]
validx = validx[:, appear_sort[-selection: ]]

In [9]:
appear_count

array([  94,  389,  336,  136,   83,   69,   92,  546,  190,  114,  667,
        105,   81,  125,  233,  196,  315,  475,   93,   86,  190,  393,
        113,  267,  182,   86,   98,  103,  509,  126,  465,  941,  552,
       1504,  336,  454, 2095,  352,  102,  236,  872,   92,  644,  208,
        118,  137,  144,  123,  814,  828,  825,  508, 1207,  120,  263,
        155,  179,  123, 2309, 2309, 2309, 2174,  275,    7,  119,   66,
         71,  283,  520,   66,   31,  595])

In [10]:
appear_sort

array([63, 70, 69, 65,  5, 66, 12,  4, 19, 25, 41,  6, 18,  0, 26, 38, 27,
       11, 22,  9, 44, 64, 53, 47, 57, 13, 29,  3, 45, 46, 55, 56, 24,  8,
       20, 15, 43, 14, 39, 54, 23, 62, 67, 16, 34,  2, 37,  1, 21, 35, 30,
       17, 51, 28, 68,  7, 32, 71, 42, 10, 48, 50, 49, 40, 31, 52, 33, 36,
       61, 59, 58, 60], dtype=int64)

In [6]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.ensemble import AdaBoostRegressor

# create a base classifier used to evaluate a subset of attributes
model = AdaBoostRegressor()

# create the RFE model and select 3 attributes
rfe = RFE(model, verbose=1)
rfe = rfe.fit(normed_trainx, trainy)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.


KeyboardInterrupt: 

In [None]:
normed_trainx.shape

In [None]:
normed_validx.shape