In [11]:
# Group 33, Florida Atlantic University
# Implementation of Wangyang Ying, Dongjie Wang, Haifeng Chen, and Yanjie Fu. 2024. Feature Selection as Deep Sequential Generative Learning. 1, 1 (March 2024),

import os
import sys
import logging
import torch
import numpy as np
import pandas as pd
import pickle
import math
import copy
import argparse
import random
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from typing import List
from collections import namedtuple
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeClassifier, Ridge, Lasso, LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split

# Hyper parameters

In [12]:
sys.argv = [
    'train_controller.py',          
    '--task_name', 'miRNA',
    '--method_name', 'transformerVae',
    '--top_k', '5',
    '--pre_train', 'True',
]

parser = argparse.ArgumentParser()
# Basic model parameters.

parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--new_gen', type=int, default=200)
parser.add_argument('--method_name', type=str, choices=['rnn', 'transformer', 'transformerVae'], default='rnn')
parser.add_argument('--task_name', type=str, choices=['methelyene', 'miRNA'], default='miRNA')
parser.add_argument('--gpu', type=int, default=0, help='used gpu')
parser.add_argument('--fe', type=str, choices=['+', '', '-'], default='-')
parser.add_argument('--top_k', type=int, default=25)
parser.add_argument('--gen_num', type=int, default=25)
parser.add_argument('--encoder_layers', type=int, default=1)
parser.add_argument('--encoder_hidden_size', type=int, default=64)
parser.add_argument('--encoder_emb_size', type=int, default=32)
parser.add_argument('--mlp_layers', type=int, default=2)
parser.add_argument('--mlp_hidden_size', type=int, default=200)
parser.add_argument('--decoder_layers', type=int, default=1)
parser.add_argument('--decoder_hidden_size', type=int, default=64)
# parser.add_argument('--source_length', type=int, default=40)
# parser.add_argument('--encoder_length', type=int, default=20)
# parser.add_argument('--decoder_length', type=int, default=40)
parser.add_argument('--encoder_dropout', type=float, default=0)
parser.add_argument('--mlp_dropout', type=float, default=0)
parser.add_argument('--decoder_dropout', type=float, default=0)
parser.add_argument('--l2_reg', type=float, default=0.0)
# parser.add_argument('--encoder_vocab_size', type=int, default=12)
# parser.add_argument('--decoder_vocab_size', type=int, default=12)
parser.add_argument('--max_step_size', type=int, default=100)
parser.add_argument('--trade_off', type=float, default=0.8)
parser.add_argument('--epochs', type=int, default=200)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--optimizer', type=str, default='adam')
parser.add_argument('--grad_bound', type=float, default=5.0)

parser.add_argument('--transformer_encoder_layers', type=int, default=2)
parser.add_argument('--encoder_nhead', type=int, default=8)
parser.add_argument('--encoder_embedding_size', type=int, default=64)
parser.add_argument('--transformer_encoder_dropout', type=float, default=0.1)
parser.add_argument('--transformer_encoder_activation', type=str, default='relu')
parser.add_argument('--encoder_dim_feedforward', type=int, default=128)
parser.add_argument('--batch_first', type=bool, default=True)  
parser.add_argument('--d_latent_dim', type=int, default=64)

parser.add_argument('--transformer_decoder_layers', type=int, default=2)
parser.add_argument('--decoder_nhead', type=int, default=8)
parser.add_argument('--transformer_decoder_dropout', type=float, default=0.1)
parser.add_argument('--transformer_decoder_activation', type=str, default='relu')
parser.add_argument('--decoder_dim_feedforward', type=int, default=128)
parser.add_argument('--decoder_embedding_size', type=int, default=64) 
parser.add_argument('--pre_train', type=str, default="True") 

args = parser.parse_args()

# Logging utils

In [13]:
def error(msg):
    logging.error(msg)
    print('ERROR: ', msg)

def info(msg):
    logging.info(msg)
    print('INFO: ', msg)

# Tools

In [14]:
def relative_absolute_error(y_test, y_predict):
    y_test = np.array(y_test)
    y_predict = np.array(y_predict)
    error = np.sum(np.abs(y_test - y_predict)) / np.sum(np.abs(np.mean(
        y_test) - y_test))
    return error

def test_task_new(Dg, task='cls', init_seed=0):
    X = Dg.iloc[:, :-1]
    y = Dg.iloc[:, -1].astype(float)
    if task == 'mcls':
        clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))
        pre_list, rec_list, f1_list, maf1_list = [], [], [], []
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                            random_state=init_seed, shuffle=True)
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        pre_list.append(precision_score(y_test, y_predict, average='macro'))
        rec_list.append(recall_score(y_test, y_predict, average='macro'))
        f1_list.append(f1_score(y_test, y_predict, average='micro'))
        maf1_list.append(f1_score(y_test, y_predict, average='macro'))
        return np.mean(pre_list), np.mean(rec_list), np.mean(f1_list), np.mean(maf1_list)
    else:
        return -1
    
def downstream_task_new(data, task_type):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1].astype(float)
    if task_type == 'mcls':
        clf = OneVsRestClassifier(RandomForestClassifier(random_state=0, n_jobs=128))
        pre_list, rec_list, f1_list, auc_roc_score = [], [], [], []
        skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
        for train, test in skf.split(X, y):
            X_train, y_train, X_test, y_test = X.iloc[train, :], y.iloc[train], X.iloc[test, :], y.iloc[test]
            clf.fit(X_train, y_train)
            y_predict = clf.predict(X_test)
            f1_list.append(f1_score(y_test, y_predict, average='micro'))
        return np.mean(f1_list)
    else:
            return -1
        

def downstream_task_by_method_std(data, task_type, method):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1].astype(float)
    if method == 'RF':
        if task_type == 'cls':
            model = RandomForestClassifier(random_state=0, n_jobs=128)
        elif task_type == 'mcls':
            model = OneVsRestClassifier(RandomForestClassifier(random_state=0), n_jobs=128)
        else:
            model = RandomForestRegressor(random_state=0, n_jobs=128)
    elif method == 'XGB':
        if task_type == 'cls':
            model = XGBClassifier(eval_metric='logloss', n_jobs=128)
        elif task_type == 'mcls':
            model = OneVsRestClassifier(XGBClassifier(eval_metric='logloss'), n_jobs=128)
        else:
            model = XGBRegressor(eval_metric='logloss', n_jobs=128)
    elif method == 'SVM':
        if task_type == 'cls':
            model = LinearSVC()
        elif task_type == 'mcls':
            model = LinearSVC()
        else:
            model = LinearSVR()
    elif method == 'KNN':
        if task_type == 'cls':
            model = KNeighborsClassifier(n_jobs=128)
        elif task_type == 'mcls':
            model = OneVsRestClassifier(KNeighborsClassifier(), n_jobs=128)
        else:
            model = KNeighborsRegressor(n_jobs=128)
    elif method == 'Ridge':
        if task_type == 'cls':
            model = RidgeClassifier()
        elif task_type == 'mcls':
            model = OneVsRestClassifier(RidgeClassifier(), n_jobs=128)
        else:
            model = Ridge()
    elif method == 'LASSO':
        if task_type == 'cls':
            model = LogisticRegression(penalty='l1',solver='liblinear', n_jobs=128)
        elif task_type == 'mcls':
            model = OneVsRestClassifier(LogisticRegression(penalty='l1',solver='liblinear'), n_jobs=128)
        else:
            model = Lasso()
    else:  # dt
        if task_type == 'cls':
            model = DecisionTreeClassifier()
        elif task_type == 'mcls':
            model = OneVsRestClassifier(DecisionTreeClassifier(), n_jobs=128)
        else:
            model = DecisionTreeRegressor()

    if task_type == 'mcls':
        clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))
        pre_list, rec_list, f1_list, auc_roc_score = [], [], [], []
        skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
        for train, test in skf.split(X, y):
            X_train, y_train, X_test, y_test = X.iloc[train, :], y.iloc[train], X.iloc[test, :], y.iloc[test]
            clf.fit(X_train, y_train)
            y_predict = clf.predict(X_test)
            f1_list.append(f1_score(y_test, y_predict, average='micro'))
        return np.mean(f1_list), np.std(f1_list)
    else:
        return -1

# Utils

In [15]:
class AvgrageMeter(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt
      
def pairwise_accuracy(la, lb):
    print(la)
    n = len(la)
    assert n == len(lb)
    total = 0
    count = 0
    for i in range(n):
        for j in range(i+1, n):
            if la[i] >= la[j] and lb[i] >= lb[j]:
                count += 1
            if la[i] < la[j] and lb[i] < lb[j]:
                count += 1
            total += 1
    return float(count) / total

def hamming_distance(la, lb):
    N = len(la)
    assert N == len(lb)
  
    def _hamming_distance(s1, s2):
        n = len(s1)
        assert n == len(s2)
        c = 0
        for i, j in zip(s1, s2):
            if i != j:
                c += 1
        return c
  
    dis = 0
    for i in range(N):
        line1 = la[i]
        line2 = lb[i]
        dis += _hamming_distance(line1, line2)
    return dis / N

def count_parameters_in_MB(model):
    return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6

class FSDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets=None, train=True, sos_id=-1, eos_id=-1):
        super(FSDataset, self).__init__()
        if targets is not None:
            assert len(inputs) == len(targets)
        self.inputs = copy.deepcopy(inputs)
        self.targets = copy.deepcopy(targets)
        self.train = train
        self.sos_id = sos_id
        self.eos_id = eos_id
        # self.swap = swap
    
    def __getitem__(self, index):
        encoder_input = self.inputs[index]
        encoder_target = None
        if self.targets is not None:
            encoder_target = self.targets[index]
        encoder_input[encoder_input==-1] = self.eos_id
        # if self.swap:
        #     a = np.random.randint(0, 5)
        #     b = np.random.randint(0, 5)
        #     encoder_input = encoder_input[:4 * a] + encoder_input[4 * a + 2:4 * a + 4] + \
        #                     encoder_input[4 * a:4 * a + 2] + encoder_input[4 * (a + 1):20 + 4 * b] + \
        #                     encoder_input[20 + 4 * b + 2:20 + 4 * b + 4] + encoder_input[20 + 4 * b:20 + 4 * b + 2] + \
        #                     encoder_input[20 + 4 * (b + 1):]

        if self.train:
            decoder_input = torch.cat((torch.tensor([self.sos_id]), encoder_input[:-1]))
            sample = {
                'encoder_input': encoder_input.long(),
                'encoder_target': encoder_target,
                'decoder_input': decoder_input.long(),
                'decoder_target': encoder_input.long(),
            }
        else:
            sample = {
                'encoder_input': encoder_input.long(),
                'decoder_target': encoder_input.long(),
            }
            if encoder_target is not None:
                sample['encoder_target'] = encoder_target
        return sample
    
    def __len__(self):
        return len(self.inputs)

# Records

In [16]:
class Record(object):
    def __init__(self, operation, performance):
        if isinstance(operation, List):
            self.operation = np.array(operation)
        elif isinstance(operation, torch.Tensor):
            self.operation = operation.numpy()
        else:
            assert isinstance(operation, np.ndarray)
            self.operation = operation
        self.performance = performance

    def get_permutated(self):
        pass

    def get_ordered(self):
        pass

    def repeat(self):
        pass

    def __eq__(self, other):
        if not isinstance(other, Record):
            return False
        return self.__hash__() == other.__hash__()

    def __hash__(self):
        return str(self.operation).__hash__()

class SelectionRecord(Record):
    def __init__(self, operation, performance):
        super().__init__(operation, performance)
        self.max_size = operation.shape[0]

    def _get_ordered(self):
        indice_select = torch.arange(0, self.max_size)[self.operation == 1]
        return indice_select, torch.FloatTensor([self.performance])

    def get_permutated(self, num=25, padding=True, padding_value=-1):
        ordered, performance = self._get_ordered()
        size = ordered.shape[0]
        shuffled_indices = torch.empty(num + 1, size)
        shuffled_indices[0] = ordered
        label = performance.unsqueeze(0).repeat(num + 1, 1)
        for i in range(num):
            shuffled_indices[i + 1] = ordered[torch.randperm(size)]
        if padding and size < self.max_size:
            shuffled_indices = F.pad(shuffled_indices, (0, (self.max_size - size)), 'constant', padding_value)
        return shuffled_indices, label

    def repeat(self, num=25, padding=True, padding_value=-1):
        ordered, performance = self._get_ordered()
        size = ordered.shape[0]
        label = performance.unsqueeze(0).repeat(num + 1, 1)
        indices = ordered.unsqueeze(0).repeat(num+1, 1)
        if padding and size < self.max_size:
            indices = F.pad(indices, (0, (self.max_size - size)), 'constant', padding_value)
            return indices, label


class RecordList(object):
    def __init__(self):
        self.r_list = set()

    def append(self, op, val):
        self.r_list.add(SelectionRecord(op, val))

    def __len__(self):
        return len(self.r_list)

    def generate(self, num=25, padding=True, padding_value=-1):
        results = []
        labels = []
        for record in self.r_list:
            result, label = record.get_permutated(num, padding, padding_value)
            results.append(result)
            labels.append(label)

        return torch.cat(results, 0), torch.cat(labels, 0)

# Feature Enviroment
## Generates different feature sequences for model training 

In [17]:
base_path = os.getcwd()

base_path = os.path.dirname(base_path)

print(base_path)

c:\Users\Trey\OneDrive - Florida Atlantic University\source\cancer_class\multiclass_lungcancer_classification_models


In [18]:
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')

In [19]:
# Seperating data from labels
pos_neg_labels, stage_labels, subtype_labels = labeled_miRNA_data.iloc[:, -2], labeled_miRNA_data.iloc[:, -2], labeled_miRNA_data.iloc[:, -1]

# Condensing stage information into a general diagnosis
# 0 == negative, 1 == posistive for lung cancer
pos_neg_labels = pos_neg_labels.apply(lambda x: 1 if x > 1 else x)

miRNA_data = labeled_miRNA_data.iloc[:, : -2]
miRNA_data['label'] = pos_neg_labels

In [20]:
MEASUREMENT = {
    'mcls' : ['precision', 'recall', 'mif1', 'maf1']
}

model_performance = {
    'mcls':namedtuple('ModelPerformance', MEASUREMENT['mcls']),
}

class Evaluator(object):
    def __init__(self, task, task_type=None, dataset=None):
        self.original_report = None
        self.records = RecordList()
        self.task_name = task
        if task_type is None:
            # self.task_type = TASK_DICT[self.task_name]
             self.task_type = 'mcls'
        else:
            self.task_type = task_type

        if task == "miRNA":
            original = miRNA_data
        elif dataset is None:
            data_path = os.path.join(base_path, "processed_data", self.task_name + "_stage_subtype.csv")
            original = pd.read_csv(data_path)
        else:
            original = dataset
        col = np.arange(original.shape[1])
        self.col_names = original.columns
        original.columns = col
        y = original.iloc[:, -1]
        x = original.iloc[:, :-1]
        self.original = original.fillna(value=0)
        y = self.original.iloc[:, -1]
        x = self.original.iloc[:, :-1]
        # 80%数据用于构建embedding space 另外20%用于测试
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                            random_state=0, shuffle=True)

        self.train = pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis=1)
        self.test = pd.concat([pd.DataFrame(X_test), pd.DataFrame(y_test)], axis=1)
        info('initialize the train and test dataset')
        self._check_path()

    def __len__(self):
        return len(self.records)

    def generate_data(self, operation, flag):
        pass

    def get_performance(self, data=None):
        if data is None:
            data = self.original
        return downstream_task_new(data, self.task_type)

    def report_ds(self):
        pass

    def _store_history(self, choice, performance):
        self.records.append(choice, performance)

    def _flush_history(self, choices, performances, is_permuted, num, padding):
        if is_permuted:
            flag_1 = 'augmented'
        else:
            flag_1 = 'original'
        if padding:
            flag_2 = 'padded'
        else:
            flag_2 = 'not_padded'
        torch.save(choices, f'{base_path}/history/{self.task_name}/choice.{flag_1}.{flag_2}.{num}.pt')
        info(f'save the choice to {base_path}/history/{self.task_name}/choice.pt')
        torch.save(performances, f'{base_path}/history/{self.task_name}/performance.{flag_1}.{flag_2}.{num}.pt')
        info(f'save the performance to {base_path}/history/{self.task_name}/performance.pt')

    def _check_path(self):
        if not os.path.exists(f'{base_path}/history/{self.task_name}'):
            os.makedirs(f'{base_path}/history/{self.task_name}')

    def save(self, num=25, padding=True, padding_value=-1):
        if num > 0:
            is_permuted = True
        else:
            is_permuted = False
        info('save the records...')
        choices, performances = \
            self.records.generate(num=num, padding=padding, padding_value=padding_value)
        self._flush_history(choices, performances, is_permuted, num, padding)

    def get_record(self, num=0, eos=-1):
        results = []
        labels = []
        print(len(self.records.r_list))
        for record in self.records.r_list:
            result, label = record.get_permutated(num, True, eos)
            print(f"Record: {record}, Label: {label}")
            results.append(result)
            labels.append(label)
        # print(result)
        # print(labels)
        return torch.cat(results, 0), torch.cat(labels, 0)

    def get_triple_record(self, num=0, eos=-1, mode='ht'):
        h_results = []
        labels = []
        t_results = []
        h_seed = []
        labels_seed = []
        for record in self.records.r_list:
            if mode.__contains__('h'):
                h, label = record.get_permutated(num, True, eos)
            else:
                h, label = record.repeat(num, True, eos)
            if mode.__contains__('t'):
                t, _ = record.get_permutated(num, True, eos)
            else:
                t, _ = record.repeat(num, True, eos)
            h_results.append(h)
            t_results.append(t)
            labels.append(label)
            h_seed.append(h_results[0])
            labels_seed.append(labels[0])
        return torch.cat(h_results, 0), torch.cat(labels, 0), torch.cat(t_results), \
               torch.cat(h_seed), torch.cat(labels_seed),

    def report_performance(self, choice, store=True, rp=True, flag='', init_seed=0):
        flag = 'train'
        opt_ds = self.generate_data(choice, flag)
        a, b, c, d = test_task_new(opt_ds, task=self.task_type, init_seed=init_seed)
        report = model_performance[self.task_type](a, b, c, d)
        if flag == 'test':
            store = False
        if self.original_report is None:
            a, b, c, d = test_task_new(self.train, task=self.task_type, init_seed=init_seed)
            self.original_report = (a, b, c, d)
        else:
            a, b, c, d = self.original_report
        original_report = model_performance[self.task_type](a, b, c, d)
        if self.task_type == 'reg':
            final_result = report.rae
            if rp:
                info('1-MAE on original is: {:.4f}, 1-MAE on generated is: {:.4f}'.
                     format(original_report.mae, report.mae))
                info('1-MSE on original is: {:.4f}, 1-MSE on generated is: {:.4f}'.
                     format(original_report.mse, report.mse))
                info('1-RAE on original is: {:.4f}, 1-RAE on generated is: {:.4f}'.
                     format(original_report.rae, report.rae))
                info('1-RMSE on original is: {:.4f}, 1-RMSE on generated is: {:.4f}'.
                     format(original_report.rmse, report.rmse))
        elif self.task_type == 'cls':
            final_result = report.f1_score
            if rp:
                info('Pre on original is: {:.4f}, Pre on generated is: {:.4f}'.
                     format(original_report.precision, report.precision))
                info('Rec on original is: {:.4f}, Rec on generated is: {:.4f}'.
                     format(original_report.recall, report.recall))
                info('F-1 on original is: {:.4f}, F-1 on generated is: {:.4f}'.
                     format(original_report.f1_score, report.f1_score))
                info('ROC/AUC on original is: {:.4f}, ROC/AUC on generated is: {:.4f}'.
                     format(original_report.roc_auc, report.roc_auc))
        elif self.task_type == 'det':
            final_result = report.ras
            if rp:
                info(
                    'Average Precision Score on original is: {:.4f}, Average Precision Score on generated is: {:.4f}'
                    .format(original_report.map, report.map))
                info(
                    'F1 Score on original is: {:.4f}, F1 Score on generated is: {:.4f}'
                    .format(original_report.f1_score, report.f1_score))
                info(
                    'ROC AUC Score on original is: {:.4f}, ROC AUC Score on generated is: {:.4f}'
                    .format(original_report.ras, report.ras))
                info(
                    'Recall on original is: {:.4f}, Recall Score on generated is: {:.4f}'
                    .format(original_report.recall, report.recall))
        elif self.task_type == 'mcls':
            final_result = report.mif1
            if rp:
                info('Pre on original is: {:.4f}, Pre on generated is: {:.4f}'.
                     format(original_report.precision, report.precision))
                info('Rec on original is: {:.4f}, Rec on generated is: {:.4f}'.
                     format(original_report.recall, report.recall))
                info('Micro-F1 on original is: {:.4f}, Micro-F1 on generated is: {:.4f}'.
                     format(original_report.mif1, report.mif1))
                info('Macro-F1 on original is: {:.4f}, Macro-F1 on generated is: {:.4f}'.
                     format(original_report.maf1, report.maf1))
        else:
            error('wrong task name!!!!!')
            assert False
        if store:
            self._store_history(choice, final_result)
        return final_result


class FeatureEvaluator(Evaluator):
    def __init__(self, task, task_type=None, dataset=None):
        super().__init__(task, task_type, dataset)
        self.ds_size = self.original.shape[1] - 1

    def generate_data(self, choice, flag=''):
        # choice = choice.T
        if choice.shape[0] != self.ds_size:
            print(choice.shape)
            print(self.ds_size)
            error('wrong shape of choice')
            assert False
        if flag == 'test':
            ds = self.test
        elif flag == 'train':
            ds = self.train
        else:
            ds = self.original
        X = ds.iloc[:, :-1]
        indice = torch.arange(0, self.ds_size)[choice == 1].numpy() # trying conversion
        X = X.iloc[:, indice].astype(np.float64)
        y = ds.iloc[:, -1].astype(np.float64)
        Dg = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
        return Dg

    def _full_mask(self):
        return torch.FloatTensor([1] * self.ds_size)

    def report_ds(self):
        per = self.get_performance()
        info(f'current dataset : {self.task_name}')
        info(f'the size of shape is : {self.original.shape[1]}')
        info(f'original performance is : {per}')
        self._store_history(self._full_mask(), per)

import time
if __name__ == '__main__':
    fe = FeatureEvaluator(args.task_name)
    
    start_time = time.time()
    fe.report_ds()
    end_time = time.time()
    info(f'training on overall eval cost : {end_time - start_time}s')
    for method in ['RF', 'XGB', 'SVM', 'KNN', 'Ridge', 'DT', 'LASSO']:
        info(method)
        start_time = time.time()
        p, std = downstream_task_by_method_std(fe.original, fe.task_type, method)
        # print(p, std)
        end_time = time.time()
        info(f'training on {method} eval cost : {end_time - start_time}s')
    choice, labels = fe.get_record(0, eos=fe.ds_size)
    fe.report_performance(choice[0]) # NEED CHOICE TO REPORT PERFORMANCE 
    # print(labels)
    fe.save()
    file_path = f'{base_path}/history/{args.task_name}/fe.pkl'
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(fe, f)

INFO:  initialize the train and test dataset
INFO:  current dataset : miRNA
INFO:  the size of shape is : 1882
INFO:  original performance is : 0.9899208244313185
INFO:  training on overall eval cost : 0.8734791278839111s
INFO:  RF
INFO:  training on RF eval cost : 2.0530478954315186s
INFO:  XGB
INFO:  training on XGB eval cost : 2.0559518337249756s
INFO:  SVM
INFO:  training on SVM eval cost : 2.0496556758880615s
INFO:  KNN
INFO:  training on KNN eval cost : 2.0814359188079834s
INFO:  Ridge
INFO:  training on Ridge eval cost : 2.0381853580474854s
INFO:  DT
INFO:  training on DT eval cost : 2.0244383811950684s
INFO:  LASSO
INFO:  training on LASSO eval cost : 2.022604465484619s
1
Record: <__main__.SelectionRecord object at 0x000001CA6E617260>, Label: tensor([[0.9899]])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


INFO:  Pre on original is: 0.4914, Pre on generated is: 0.4914
INFO:  Rec on original is: 0.5000, Rec on generated is: 0.5000
INFO:  Micro-F1 on original is: 0.9829, Micro-F1 on generated is: 0.9829
INFO:  Macro-F1 on original is: 0.4957, Macro-F1 on generated is: 0.4957
INFO:  save the records...
INFO:  save the choice to c:\Users\Trey\OneDrive - Florida Atlantic University\source\cancer_class\multiclass_lungcancer_classification_models/history/miRNA/choice.pt
INFO:  save the performance to c:\Users\Trey\OneDrive - Florida Atlantic University\source\cancer_class\multiclass_lungcancer_classification_models/history/miRNA/performance.pt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Encoder

In [21]:
class Encoder(nn.Module):
    def __init__(self,
                 layers,
                 vocab_size,
                 hidden_size):
        super().__init__()
        self.layers = layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)

    def infer(self, x, predict_lambda, direction='-'):
        encoder_outputs, encoder_hidden, seq_emb, predict_value, mu, logvar = self(x)
        grads_on_outputs = torch.autograd.grad(predict_value, encoder_outputs, torch.ones_like(predict_value))[0]
        if direction == '+':
            new_encoder_outputs = encoder_outputs + predict_lambda * grads_on_outputs
        elif direction == '-':
            new_encoder_outputs = encoder_outputs - predict_lambda * grads_on_outputs
        else:
            raise ValueError('Direction must be + or -, got {} instead'.format(direction))
        new_encoder_outputs = F.normalize(new_encoder_outputs, 2, dim=-1)
        new_seq_emb = torch.mean(new_encoder_outputs, dim=1)
        new_seq_emb = F.normalize(new_seq_emb, 2, dim=-1)
        return encoder_outputs, encoder_hidden, seq_emb, predict_value, new_encoder_outputs, new_seq_emb

    def forward(self, x):
        pass

class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # 初始化Shape为(max_len, d_model)的PE (positional encoding)
        pe = torch.zeros(max_len, d_model)
        # 初始化一个tensor [[0, 1, 2, 3, ...]]
        position = torch.arange(0, max_len).unsqueeze(1)
        # 这里就是sin和cos括号中的内容，通过e和ln进行了变换
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        # 计算PE(pos, 2i)
        pe[:, 0::2] = torch.sin(position * div_term)
        # 计算PE(pos, 2i+1)
        pe[:, 1::2] = torch.cos(position * div_term)
        # 为了方便计算，在最外面在unsqueeze出一个batch
        pe = pe.unsqueeze(0)
        # 如果一个参数不参与梯度下降，但又希望保存model的时候将其保存下来
        # 这个时候就可以用register_buffer
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x 为embedding后的inputs,例如(1,7, 128),batch size为1,7个单词,单词维度为128
        """
        # 将x和positional encoding相加。
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

class TransformerEncoderVAE(Encoder):
    def __init__(
            self,
            num_encoder_layers,
            nhead, 
            vocab_size,
            embedding_size,
            dropout, 
            activation,
            dim_feedforward,
            batch_first,
            mlp_layers,
            mlp_hidden_size,
            mlp_dropout,
            d_latent_dim,
            ):
        super(TransformerEncoderVAE, self).__init__(num_encoder_layers, vocab_size, embedding_size)
        # positional layer
        self.positionalEncoding = PositionalEncoding(
                                d_model = embedding_size,
                                dropout = dropout,
                                max_len = vocab_size)
        # multi-head attention && feed forward && norm -> encoder layer 
        self.encoderLayer = nn.TransformerEncoderLayer(
                                d_model = embedding_size,
                                nhead = nhead,
                                dropout = dropout,
                                activation = activation,
                                dim_feedforward = dim_feedforward,
                                batch_first = batch_first)
        # stack encoder layers to construct transformer encoder
        self.encoder = nn.TransformerEncoder(
                                encoder_layer = self.encoderLayer,
                                num_layers = num_encoder_layers)
        self.mu = nn.Linear(embedding_size, d_latent_dim)
        self.logvar = nn.Linear(embedding_size, d_latent_dim)
        # mlp layer
        self.mlp = nn.Sequential()
        for i in range(mlp_layers):
            if i == 0:
                self.mlp.add_module('layer_{}'.format(i), nn.Sequential(
                    nn.Linear(d_latent_dim, mlp_hidden_size),
                    nn.ReLU(inplace=False),
                    nn.Dropout(p=mlp_dropout)))
            else:
                self.mlp.add_module('layer_{}'.format(i), nn.Sequential(
                    nn.Linear(mlp_hidden_size, mlp_hidden_size),
                    nn.ReLU(inplace=False),
                    nn.Dropout(p=mlp_dropout)))
        self.regressor = nn.Linear(d_latent_dim if mlp_layers == 0 else mlp_hidden_size, 1)
    
    def reparameterize(self, mu, logvar):
        # epsilon: 噪声
        # epsilon = torch.randn_like(mu)
        epsilon = 1
        return mu + epsilon * torch.exp(logvar/2)
    
    def forward(self, x):
        # get embedding
        embedding = self.embedding(x)
        # add positional information
        embedding = self.positionalEncoding(embedding)
        
        # encoder output
        out = self.encoder(embedding)
        out = F.normalize(out, 2, dim=-1)
        encoder_outputs = out
        
        # add all embedding and compute mean / summarize
        out = torch.mean(out, dim=1)
        out = F.normalize(out, 2, dim=-1)
        seq_emb = out

        # compute mu, logvar to compute the KL-loss
        mu, logvar = self.mu(out), self.logvar(out)
        # reparameterize
        out = self.reparameterize(mu, logvar)
        
        # evaluator
        out = self.mlp(out)
        out = self.regressor(out)
        predict_value = torch.sigmoid(out)
        # 适配RNNEncoder的输出
        encoder_hidden = None
        # encoder_outputs, encoder_hidden, seq_emb, predict_value
        # encoder_outputs shape (batch_size, sequence_length, embedding_dim)
        return encoder_outputs, encoder_hidden, seq_emb, predict_value, mu, logvar
    

def construct_encoder(fe: FeatureEvaluator, args) -> Encoder:
    name = args.method_name
    size = fe.ds_size
    info(f'Construct Encoder with method {name}...')
    if name == 'transformerVae':
        return TransformerEncoderVAE(
            num_encoder_layers = args.transformer_encoder_layers,
            nhead = args.encoder_nhead,
            vocab_size = size + 1,
            embedding_size = args.encoder_embedding_size,
            dropout = args.transformer_encoder_dropout,
            activation = args.transformer_encoder_activation,
            dim_feedforward = args.encoder_dim_feedforward,
            batch_first = args.batch_first,
            mlp_layers = args.mlp_layers,
            mlp_hidden_size = args.mlp_hidden_size,
            mlp_dropout = args.encoder_dropout,
            d_latent_dim = args.d_latent_dim
        )
    else:
        assert False

# Decoder

In [22]:
class Attention(nn.Module):
    def __init__(self, input_dim, source_dim=None, output_dim=None, bias=False):
        super(Attention, self).__init__()
        if source_dim is None:
            source_dim = input_dim
        if output_dim is None:
            output_dim = input_dim
        self.input_dim = input_dim
        self.source_dim = source_dim
        self.output_dim = output_dim
        self.input_proj = nn.Linear(input_dim, source_dim, bias=bias)
        self.output_proj = nn.Linear(input_dim + source_dim, output_dim, bias=bias)
        self.mask = None

    def set_mask(self, mask):
        self.mask = mask

    def forward(self, input, source_hids):
        batch_size = input.size(0)
        source_len = source_hids.size(1)

        # (batch, tgt_len, input_dim) -> (batch, tgt_len, source_dim)
        x = self.input_proj(input)

        # (batch, tgt_len, source_dim) * (batch, src_len, source_dim) -> (batch, tgt_len, src_len)
        attn = torch.bmm(x, source_hids.transpose(1, 2))
        if self.mask is not None:
            attn.data.masked_fill_(self.mask, -float('inf'))
        attn = F.softmax(attn.view(-1, source_len), dim=1).view(batch_size, -1, source_len)

        # (batch, tgt_len, src_len) * (batch, src_len, source_dim) -> (batch, tgt_len, source_dim)
        mix = torch.bmm(attn, source_hids)

        # concat -> (batch, tgt_len, source_dim + input_dim)
        combined = torch.cat((mix, input), dim=2)
        # output -> (batch, tgt_len, output_dim)
        output = torch.tanh(self.output_proj(combined.view(-1, self.input_dim + self.source_dim))).view(batch_size, -1,
                                                                                                        self.output_dim)

        return output, attn

class Decoder(nn.Module):
    KEY_ATTN_SCORE = 'attention_score'
    KEY_LENGTH = 'length'
    KEY_SEQUENCE = 'sequence'

    def __init__(self,
                 layers,
                 vocab_size,
                 hidden_size,
                 dropout,
                 length, gpu):
        super(Decoder, self).__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.length = length  # total length to decode
        self.vocab_size = vocab_size
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.sos_id = vocab_size - 1
        self.eos_id = vocab_size - 1
        self.gpu = gpu


class TransformerDecoder(Decoder):
    def __init__(self, 
                 num_decoder_layers, 
                 nhead, 
                 vocab_size, 
                 embedding_size,
                 dropout, 
                 activation,
                 dim_feedforward,
                 batch_first,
                 length, 
                 gpu):
        super(TransformerDecoder, self).__init__(
            num_decoder_layers, 
            vocab_size, 
            embedding_size, 
            dropout, 
            length, 
            gpu)
        self.embedding_size = embedding_size
        # positional layer
        self.positionalEncoding = PositionalEncoding(
                                d_model = embedding_size,
                                dropout = dropout,
                                max_len = vocab_size)
        # decoder layer
        self.decoderLayer = nn.TransformerDecoderLayer(
                                d_model = embedding_size,
                                nhead = nhead,
                                dropout = dropout,
                                activation = activation,
                                dim_feedforward = dim_feedforward,
                                batch_first = batch_first)
        # stack decoder layer to construct transformer decoder
        self.decoder = nn.TransformerDecoder(
                                decoder_layer = self.decoderLayer,
                                num_layers = num_decoder_layers)
        self.attention = Attention(embedding_size)
        # out put
        self.out = nn.Linear(embedding_size, vocab_size)
        
    def forward_train_valid(self, x, encoder_outputs):
        batch_size = x.shape[0]
        output_size = x.shape[1]
        # By passing cuda for now 
        # x = x.cuda(self.gpu)
        x.to('cpu')
        embedded = self.embedding(x)
        embedded = self.positionalEncoding(embedded)
        
        # construct squence mask
        # tgt_mask = nn.Transformer.generate_square_subsequent_mask(output_size).cuda(self.gpu)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(output_size).to('cpu')
        out = self.decoder(embedded, encoder_outputs, tgt_mask)

        # out, attn = self.attention(out, encoder_outputs)

        predict_softmax = F.log_softmax(self.out(out.contiguous().view(-1, self.embedding_size)), dim=1)
        predict_softmax = predict_softmax.view(batch_size, output_size, -1)
        # print(predict_softmax)
        return predict_softmax

    def forward_step(self, encoder_outputs, input_id):
        embedded = self.embedding(input_id)
        embedded = self.positionalEncoding(embedded)
        # tgt_mask = nn.Transformer.generate_square_subsequent_mask(input_id.shape[1]).cuda(self.gpu)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(input_id.shape[1]).to('cpu')
        out = self.decoder(embedded, encoder_outputs, tgt_mask)

        # out, attn = self.attention(out, encoder_outputs)

        predict_softmax = F.log_softmax(self.out(out.contiguous().view(-1, self.embedding_size)), dim=1)
        _, next_input_id = predict_softmax.max(dim=1, keepdim=True)
        output_id = next_input_id.reshape(input_id.shape[0], input_id.shape[1])
        return output_id
    
    def forward_infer(self, encoder_outputs):
        # initialize the input id
        batch_size = encoder_outputs.shape[0]
        input_id = torch.LongTensor([self.sos_id] * batch_size).view(batch_size, 1).to('cpu')
        # input_id = torch.LongTensor([self.sos_id] * batch_size).view(batch_size, 1).cuda(self.gpu)
        for step in range(self.length):
            output_id = self.forward_step(encoder_outputs, input_id)
            input_id = torch.cat((input_id, output_id[:,-1].reshape(-1, 1)), dim=1)
        return output_id
    

def construct_decoder(fe: FeatureEvaluator, args) -> Decoder:
        name = args.method_name
        size = fe.ds_size
        info(f'Construct Decoder with method {name}...')
        if name == 'transformer' or name == 'transformerVae':
            return TransformerDecoder(
                num_decoder_layers = args.transformer_decoder_layers,
                nhead = args.decoder_nhead,
                vocab_size = size + 1,
                embedding_size = args.decoder_embedding_size,
                dropout = args.transformer_decoder_dropout,
                activation = args.transformer_decoder_activation,
                dim_feedforward = args.decoder_dim_feedforward,
                batch_first = args.batch_first,
                length = size,
                gpu = args.gpu
            )
        else:
            assert False

# Gradient Optimization

In [23]:
SOS_ID = 0
EOS_ID = 0


# gradient based automatic feature selection
class GAFS(nn.Module):
    def __init__(self,
                 fe:FeatureEvaluator,
                 args
                 ):
        super(GAFS, self).__init__()
        self.style = args.method_name
        self.gpu = args.gpu
        self.encoder = construct_encoder(fe, args)
        self.decoder = construct_decoder(fe, args)

    def flatten_parameters(self):
        self.encoder.rnn.flatten_parameters()
        self.decoder.rnn.flatten_parameters()

    def forward(self, input_variable, target_variable=None):
        mu = 0.0
        logvar = 0.0
        if self.style == "transformerVae":
            encoder_outputs, encoder_hidden, feat_emb, predict_value, mu, logvar = self.encoder.forward(input_variable)
            decoder_outputs = self.decoder.forward_train_valid(target_variable, encoder_outputs)
            _, feat = decoder_outputs.max(2, keepdim=True)
            feat = feat.reshape(input_variable.size(0), input_variable.size(1))

        return predict_value, decoder_outputs, feat, mu, logvar


    def generate_new_feature(self, input_variable, predict_lambda=1, direction='-'):
        if self.style == 'transformer' or self.style == 'transformerVae':
            encoder_outputs, encoder_hidden, feat_emb, predict_value, new_encoder_outputs, new_feat_emb = \
                self.encoder.infer(input_variable, predict_lambda, direction=direction)
            new_feat_seq = self.decoder.forward_infer(new_encoder_outputs)
        return new_feat_seq

# Training

In [None]:
baseline_name = [
    'kbest',
    'mrmr',
    'lasso',
    'rfe',
    # 'gfs',
    'lassonet',
    'sarlfs',
    'marlfs',

]

def gafs_train(train_queue, model: GAFS, optimizer):
    objs = AvgrageMeter()
    mse = AvgrageMeter()
    nll = AvgrageMeter()
    kl = AvgrageMeter()
    model.train()
    for step, sample in enumerate(train_queue):
        encoder_input = sample['encoder_input']
        encoder_target = sample['encoder_target']
        decoder_input = sample['decoder_input']
        decoder_target = sample['decoder_target']

        # Excluding cuda

        # encoder_input = encoder_input.cuda(model.gpu)
        # encoder_target = encoder_target.cuda(model.gpu).requires_grad_()
        # decoder_input = decoder_input.cuda(model.gpu)
        # decoder_target = decoder_target.cuda(model.gpu)

        encoder_input = encoder_input.to('cpu')
        encoder_target = encoder_target.to('cpu').requires_grad_()
        decoder_input = decoder_input.to('cpu')
        decoder_target = decoder_target.to('cpu')

        optimizer.zero_grad()
        predict_value, log_prob, arch, mu, logvar= model.forward(encoder_input, decoder_input)
        # 都没有除以batch size，如果需要除以batch size，令 reduction = "mean"
        loss_1 = F.mse_loss(predict_value.squeeze(), encoder_target.squeeze()) # mse loss
        loss_2 = F.nll_loss(log_prob.contiguous().view(-1, log_prob.size(-1)), decoder_target.view(-1)) # ce loss
        if args.method_name == "transformerVae": 
            kl_loss = (-0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()))
            if args.pre_train == 'True':
                loss = args.trade_off * loss_1 + (1 - args.trade_off) * loss_2
            else:
                loss = loss = args.trade_off * loss_1 + (1 - args.trade_off) * loss_2 + 0.001 * kl_loss
        elif args.method_name == "transformer": 
            kl_loss = torch.tensor(1, dtype=torch.long)
            loss = args.trade_off * loss_1 + (1 - args.trade_off) * loss_2
            # loss = loss_1 + loss_2
            # l2_regularization = torch.tensor(0.0).cuda(model.gpu)
            # for param in model.parameters():
            #     l2_regularization += torch.norm(param, 2).cuda(model.gpu)
            # loss += args.l2_reg * l2_regularization 
        else:
            kl_loss = torch.tensor(1, dtype=torch.long)
            loss = args.trade_off * loss_1 + (1 - args.trade_off) * loss_2
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_bound)
        optimizer.step()

        n = encoder_input.size(0)
        objs.update(loss.data, n)
        mse.update(loss_1.data, n)
        nll.update(loss_2.data, n)
        kl.update(kl_loss.data, n)
    return objs.avg, mse.avg, nll.avg, kl.avg


def gafs_valid(queue, model: GAFS):
    pa = AvgrageMeter()
    hs = AvgrageMeter()
    mse = AvgrageMeter()
    ce = AvgrageMeter()
    with torch.no_grad():
        model.eval()
        for step, sample in enumerate(queue):
            encoder_input = sample['encoder_input']
            encoder_target = sample['encoder_target']
            decoder_input = sample['decoder_input']
            decoder_target = sample['decoder_target']

            # Excluding cuda

            # encoder_input = encoder_input.cuda(model.gpu)
            # encoder_target = encoder_target.cuda(model.gpu)
            # decoder_input = decoder_input.cuda(model.gpu)
            # decoder_target = decoder_target.cuda(model.gpu)

            encoder_input = encoder_input.to('cpu')
            encoder_target = encoder_target.to('cpu')
            decoder_input = decoder_input.to('cpu')
            decoder_target = decoder_target.to('cpu')

            predict_value, logits, arch, mu, logvar = model.forward(encoder_input, decoder_input)
            n = encoder_input.size(0)
            # print(encoder_target)
            pairwise_acc = pairwise_accuracy(encoder_target.data.squeeze().tolist(),
                                             predict_value.data.squeeze().tolist())
            hamming_dis = hamming_distance(decoder_target.data.squeeze().tolist(), arch.data.squeeze().tolist())
            mse.update(F.mse_loss(predict_value.data.squeeze(), encoder_target.data.squeeze()), n)
            pa.update(pairwise_acc, n)
            hs.update(hamming_dis, n)
            ce.update(F.nll_loss(logits.contiguous().view(-1, logits.size(-1)), decoder_target.view(-1)), n)
    return mse.avg, pa.avg, hs.avg, ce.avg


def choice_to_onehot(choice: List[int]):
    size = len(choice)
    onehot = torch.zeros(size + 1)
    onehot[torch.tensor(choice)] = 1
    return onehot[:-1]
    # if choice.dim() == 1:
    #     selected = torch.zeros_like(choice)
    #     selected[choice] = 1
    #     return selected[1:-1]
    # else:
    #     onehot = torch.empty_like(choice)
    #     for i in range(choice.shape[0]):
    #         onehot[i] = choice_to_onehot(choice[i])
    #     return onehot


def gafs_infer(queue, model, step, direction='+'):
    new_gen_list = []
    model.eval()
    for i, sample in enumerate(queue):
        encoder_input = sample['encoder_input']
        encoder_input = encoder_input.to('cpu')
        # Excluding cuda
        # encoder_input = encoder_input.cuda(model.gpu)
        model.zero_grad()
        new_gen = model.generate_new_feature(encoder_input, predict_lambda=step, direction=direction)
        new_gen_list.extend(new_gen.data.squeeze().tolist())
    return new_gen_list


def select_top_k(choice: Tensor, labels: Tensor, k: int) -> (Tensor, Tensor):
    values, indices = torch.topk(labels, min(k, labels.shape[0]), dim=0)
    return choice[indices.squeeze()], labels[indices.squeeze()]


def main():
    if not torch.cuda.is_available():
        info('No GPU found!')
        #sys.exit(1)
    # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(x) for x in args.gpu)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # torch.cuda.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    # cudnn.enabled = True
    # cudnn.benchmark = False
    # cudnn.deterministic = True
    # device = int(args.gpu)

    device = 'cpu'
    info(f"Args = {args}")

    with open(f'{base_path}/history/{args.task_name}/fe.pkl', 'rb') as f:
        fe: FeatureEvaluator = pickle.load(f)
    model = GAFS(fe, args)
    if args.pre_train == "False":
        model.load_state_dict(torch.load(f'{base_path}/history/{args.task_name}/GAFS_pretrain_{args.method_name}.model_dict'))
    elif args.pre_train == "Search":
        model.load_state_dict(torch.load(f'{base_path}/history/{args.task_name}/GAFS_{args.method_name}.model_dict'))
    
    info(f"param size = {count_parameters_in_MB(model)}MB")
    # Exculding cuda
    # model = model.cuda(device)

    # 设置为0表示不进行数据增强
    choice, labels = fe.get_record(args.gen_num, eos=fe.ds_size)
    valid_choice, valid_labels = fe.get_record(0, eos=fe.ds_size)

    info('Training Encoder-Predictor-Decoder')
    min_val = min(labels)
    max_val = max(labels)
    train_encoder_target = [(i - min_val) / (max_val - min_val) for i in labels] # min == max 
    valid_encoder_target = [(i - min_val) / (max_val - min_val) for i in valid_labels]

    train_dataset = FSDataset(choice, train_encoder_target, train=True, sos_id=fe.ds_size, eos_id=fe.ds_size)
    valid_dataset = FSDataset(valid_choice, valid_encoder_target, train=True, sos_id=fe.ds_size, eos_id=fe.ds_size)
    train_queue = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, pin_memory=True)
    valid_queue = torch.utils.data.DataLoader(
        valid_dataset, batch_size=len(valid_dataset), shuffle=False, pin_memory=True)
    nao_optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2_reg)
    save_model = model
    cur_loss = float('inf')
    best_epoch = 0
    for nao_epoch in range(1, args.epochs + 1):
        sys.stdout.flush()
        sys.stderr.flush()
        nao_loss, nao_mse, nao_ce, kl = gafs_train(train_queue, model, nao_optimizer)
        if nao_epoch % 10 == 0 or nao_epoch == 1:
            info("epoch {:04d} train loss {:.6f} mse {:.6f} ce {:.6f}, kl {:.6f}".format(nao_epoch, nao_loss, nao_mse, nao_ce, kl))
            if nao_loss < cur_loss:
                save_model = model
                cur_loss = nao_loss
                best_epoch = nao_epoch
        if nao_epoch % 100 == 0 or nao_epoch == 1:
            mse, pa, hs, ce = gafs_valid(train_queue, model)
            info("Evaluation on train data")
            info('epoch {:04d} mse {:.6f} ce {:.6f} pairwise accuracy {:.6f} hamming distance {:.6f}'.format(nao_epoch, mse, ce, pa,
                                                                                                   hs))
            mse, pa, hs, ce = gafs_valid(valid_queue, model)
            info("Evaluation on valid data")
            info('epoch {:04d} mse {:.6f} ce {:.6f} pairwise accuracy {:.6f} hamming distance {:.6f}'.format(nao_epoch, mse, ce, pa,
                                                                                                   hs))
    model = save_model
    info("best model from epoch {:04d}".format(best_epoch))

    top_selection, top_performance = select_top_k(valid_choice, valid_labels, args.top_k)

    infer_dataset = FSDataset(top_selection, top_performance, False, sos_id=fe.ds_size, eos_id=fe.ds_size)
    infer_queue = torch.utils.data.DataLoader(infer_dataset, batch_size=len(infer_dataset), shuffle=False,
                             pin_memory=True)
    if args.method_name != "transformerVae" or (args.method_name == "transformerVae" and args.pre_train != "True"):
        new_selection = []
        new_choice = []
        predict_step_size = 0
        while len(new_selection) < args.new_gen:
            predict_step_size += 1
            info('Generate new architectures with step size {:.2f}'.format(predict_step_size))
            new_record = gafs_infer(infer_queue, model, direction='+', step=predict_step_size)
            for choice in new_record:
                onehot_choice = choice_to_onehot(choice)
                if onehot_choice.sum() <= 0:
                    error('insufficient selection')
                    continue
                record = SelectionRecord(onehot_choice.numpy(), -1)
                if record not in fe.records.r_list and record not in new_selection:
                    new_selection.append(record)
                    new_choice.append(onehot_choice)
                if len(new_selection) >= args.new_gen:
                    break
            info(f'{len(new_selection)} new choice generated now', )
            if predict_step_size > args.max_step_size:
                break
        info(f'build {len(new_selection)} new choice !!!')

        new_choice_pt = torch.stack(new_choice)
        if args.gen_num == 0:
            choice_path = f'{base_path}/history/{fe.task_name}/generated_choice_{args.method_name}.pt'
        else:
            choice_path = f'{base_path}/history/{fe.task_name}/generated_choice_{args.method_name}.pt'
        torch.save(new_choice_pt, choice_path)
        info(f'save generated choice to {choice_path}')

    # previous_optimal = float(torch.max(valid_labels))
    optimal_selection = None
    if args.pre_train == "True":
        torch.save(model.state_dict(), f'{base_path}/history/{fe.task_name}/GAFS_pretrain_{args.method_name}.model_dict')
        torch.save(model.state_dict(), f'{base_path}/history/{fe.task_name}/GAFS_{args.method_name}.model_dict')
    else:
        torch.save(model.state_dict(), f'{base_path}/history/{fe.task_name}/GAFS_{args.method_name}.model_dict')
    if args.pre_train == "True":
        return -1
    best_selection = None
    best_optimal = -1000
    best_selection_test = None
    best_optimal_test = -1000
    # info(f'the best performance for this task is {previous_optimal}')
   
    for s in new_selection:
        train_data = fe.generate_data(s.operation, 'train')
        result = fe.get_performance(train_data)
        test_data = fe.generate_data(s.operation, 'test')
        test_result = fe.get_performance(test_data)
        # if result > previous_optimal:
        #     optimal_selection = s.operation
        #     previous_optimal = result
        #     info(f'found optimal selection! the choice is {s.operation}, the performance on train is {result}')
        if result > best_optimal:
            best_selection = s.operation
            best_optimal = result
            info(f'found best on train : {best_optimal}')
        if test_result > best_optimal_test:
            best_selection_test = s.operation
            best_optimal_test = test_result
            info(f'found best on test : {best_optimal_test}')

    opt_path = f'{base_path}/history/{fe.task_name}/best-ours.hdf'
    ori_p = fe.report_performance(best_selection, flag='test')
    info(f'found train generation in our method! the choice is {best_selection}, the performance is {ori_p}')
    fe.generate_data(best_selection, 'train').to_hdf(opt_path, key='train')
    fe.generate_data(best_selection, 'test').to_hdf(opt_path, key='test')

    opt_path_test = f'{base_path}/history/{fe.task_name}/best-ours-test.hdf'
    test_p = fe.report_performance(best_selection_test, flag='test')
    info(f'found test generation in our method! the choice is {best_selection_test}, the performance is {test_p}')
    fe.generate_data(best_selection_test, 'train').to_hdf(opt_path_test, key='train')
    fe.generate_data(best_selection_test, 'test').to_hdf(opt_path_test, key='test')
    ps = []
    info('given overall validation')
    # Excluding baseline reports
    # report_head = 'RAW\t'
    # raw_test = pandas.read_hdf(f'{base_path}/history/{fe.task_name}.hdf', key='raw_test')
    # ps.append('{:.2f}'.format(fe.get_performance(raw_test) * 100))
    # for method in baseline_name:
    #     report_head += f'{method}\t'
    #     spe_test = pandas.read_hdf(f'{base_path}/history/{fe.task_name}.hdf', key=f'{method}_test')
    #     ps.append('{:.2f}'.format(fe.get_performance(spe_test) * 100))
    # report_head += 'Ours\tOurs_Test'
    # print(report_head)
    report = ''
    for per in ps:
        report += f'{per}&\t'
    report += '{:.2f}&\t'.format(ori_p * 100)
    report += '{:.2f}&\t'.format(test_p * 100)
    print(report)

if __name__ == '__main__':
    main()

  return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6


INFO:  No GPU found!
INFO:  Args = Namespace(seed=42, new_gen=200, method_name='transformerVae', task_name='miRNA', gpu=0, fe='-', top_k=5, gen_num=25, encoder_layers=1, encoder_hidden_size=64, encoder_emb_size=32, mlp_layers=2, mlp_hidden_size=200, decoder_layers=1, decoder_hidden_size=64, encoder_dropout=0, mlp_dropout=0, decoder_dropout=0, l2_reg=0.0, max_step_size=100, trade_off=0.8, epochs=200, batch_size=32, lr=0.001, optimizer='adam', grad_bound=5.0, transformer_encoder_layers=2, encoder_nhead=8, encoder_embedding_size=64, transformer_encoder_dropout=0.1, transformer_encoder_activation='relu', encoder_dim_feedforward=128, batch_first=True, d_latent_dim=64, transformer_decoder_layers=2, decoder_nhead=8, transformer_decoder_dropout=0.1, transformer_decoder_activation='relu', decoder_dim_feedforward=128, decoder_embedding_size=64, pre_train='True')
INFO:  Construct Encoder with method transformerVae...
INFO:  Construct Decoder with method transformerVae...
INFO:  param size = 0.688