# E-SEIC
Selection of evolutionary instances with constraints for imbalanced datasets

## 数据集的预处理 

In [5]:
from utils.dataset_utils import get_distribution, k_fold_cross_validation, remove_class
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection.operator.init_toolbox import init_toolbox_eseic
from instance_selection.operator.metrics import calculate_gmean_mauc, calculate_average_accuracy, \
    calculate_average_gmean_mauc, calculate_accuracy
from instance_selection.operator.genetic_operator import selTournamentNDCD
from instance_selection.operator.ensemble import vote_result_ensembles, ensemble_individuals
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
import scipy.io as sio  # 从.mat文件中读取数据集
import random
from deap import tools
import warnings
import numpy as np

warnings.filterwarnings("ignore")  # silence every warning for the rest of the session


def _inverse_frequency_weights(counts):
    """Return per-class weights proportional to 1/count, normalised to sum to 1.

    Args:
        counts: NumPy array holding the number of samples of each class.

    Returns:
        NumPy float array of the same length as ``counts``.
    """
    inverse = 1 / counts.astype(float)
    return inverse / np.sum(inverse)


DATASET = Balance_Scale  # dataset descriptor (also carries the matching hyper-parameters)
datasetname = DATASET.DATASETNAME.split('.')[0]  # file name up to the first dot
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET.DATASETNAME)  # load the .mat dataset
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] flattens it to [n]
# Stratified 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y,random_state=RANDOM_SEED)
scaler = StandardScaler()  # standardise features; statistics are fitted on the training split only
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
unique_elements_all, classes_all, counts_all = get_distribution(y)  # distribution of the full dataset
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # distribution of the training split
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # distribution of the test split
print(datasetname + f' distribution: {counts_all}')
print(f'trainset distribution: {counts_train}')
print(f'testset distribution: {counts_test}')

# Base classifier; its hyper-parameters come from the dataset descriptor.
model = MLPClassifier(hidden_layer_sizes=(DATASET.HIDDEN_SIZE,), max_iter=DATASET.MAX_ITER,
                      random_state=RANDOM_SEED, learning_rate_init=DATASET.LEARNING_RATE)

# Per-class weights: the rarer the class, the larger its weight.
weights_train = _inverse_frequency_weights(counts_train)
weights_test = _inverse_frequency_weights(counts_test)

# 90% of the smallest training class, rounded UP (np.ceil), e.g. 34 -> 31.
num_instances = int(np.ceil(counts_train.min() * 0.9))
print("最小数量:", num_instances)

# Cross-validated soft labels (class probabilities) on the training split,
# computed with a fresh clone so `model` itself stays unfitted at this point.
y_train_pred_proba = k_fold_cross_validation(model=clone(model), X=x_train, y=y_train, n_splits=N_SPLITS, method='soft',
                                             random_state=RANDOM_SEED)
# Turn probabilities into hard predictions.
# NOTE(review): argmax yields column indices; assumes labels are 0..k-1 in column order - confirm.
y_train_pred = np.argmax(y_train_pred_proba, axis=1)

# The baseline accuracies on the training split are passed to the toolbox as `constraints`.
Acc1, Acc2, Acc3 = calculate_accuracy(y_train_pred, y_train, weights_train)
constraints = [Acc1, Acc2, Acc3]

# Baseline: fit the plain model on the full training split and report test G-mean / mAUC.
model.fit(x_train, y_train)
y_test_pred_proba = model.predict_proba(x_test)
print(calculate_gmean_mauc(y_test_pred_proba, y_test))

Balance_Scale distribution: [ 49 288 288]
trainset distribution: [ 34 201 202]
testset distribution: [15 87 86]
最小数量: 31
(0.977265, 0.999229, array([0.93333333, 1.        , 1.        ]))


## E-SEIC

In [6]:
import statistics

# Build the toolbox used by main(); it receives the base model, the training
# split, the per-class training weights and the baseline accuracy constraints.
toolbox = init_toolbox_eseic(
    model,
    x_train,
    y_train,
    weights_train,
    constraints,
    n_splits=N_SPLITS,
    random_seed=RANDOM_SEED,
)

# Module-level histories that main() appends to:
#   perfomance_per_generation - [gmean, mauc] of the voted ensemble on the test split
#   median_gmean_list / median_mauc_list - per-generation medians over the ensemble members
perfomance_per_generation = []
median_gmean_list, median_mauc_list = [], []
def main(x_train, y_train, model, balanced_method='balanced'):
    """Run the E-SEIC evolutionary loop and return the final ensemble.

    Besides its arguments, the function reads the module-level ``toolbox``,
    ``x_test`` and ``y_test`` and appends to the module-level histories
    ``perfomance_per_generation``, ``median_gmean_list`` and
    ``median_mauc_list``.

    Args:
        x_train: Training features, forwarded to ``ensemble_individuals``.
        y_train: Training labels, forwarded to ``ensemble_individuals``.
        model: Base estimator forwarded to ``ensemble_individuals``; the final
            ensemble is built from ``clone(model)``.
        balanced_method: Forwarded to ``toolbox.init_population``.

    Returns:
        Whatever ``ensemble_individuals`` returns for the individuals selected
        in the last generation (or for the initial population if no
        generation was run).
    """
    # NOTE(review): no statistic functions are registered on `stats` in this
    # function, so `stats.compile(pop)` likely contributes nothing - confirm.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    logbook = tools.Logbook()
    logbook.header = "gen", "feasible", "ensembles_size", "median_gmean", "median_mauc", "avg_acc2"

    # ---------------------------- population initialisation ----------------------------
    pop = toolbox.population(n=POPSIZE)  # per the original note, individuals start all-zero
    pop = toolbox.init_population(pop, balanced_method=balanced_method)
    toolbox.evaluate(pop)  # compute fitness of the initial individuals
    ensemble_initial = ensemble_individuals(pop, model, x_train, y_train)
    vote_pred_prob = vote_result_ensembles(ensemble_initial, x_test)  # soft labels by default (original note)
    gmean_init, mauc_init, _ = calculate_gmean_mauc(vote_pred_prob, y_test)
    perfomance_per_generation.append([gmean_init, mauc_init])

    # Fallback so the final return is well-defined even when the loop below
    # executes zero generations (previously an UnboundLocalError).
    ensembles = pop

    # ---------------------------- generational loop ----------------------------
    # NOTE(review): this runs NGEN + 20 generations, not NGEN - confirm the offset is intended.
    for gen in range(1, NGEN + 21):
        # Tournament selection (original note: by non-domination rank first, then crowding distance).
        offspring = selTournamentNDCD(pop, POPSIZE, tournsize=3)
        offspring = [toolbox.clone(ind) for ind in offspring]  # work on copies, not on the parents
        # Pairwise variation; with an odd number of offspring the last one is left untouched.
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])  # crossover
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]  # mutation
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]  # mutation
            # Both were (potentially) modified, so their cached fitness is invalid.
            del offspring[i].fitness.values, offspring[i + 1].fitness.values

        # ---------------------------- merge and de-duplicate ----------------------------
        # Enforce the per-class minimum-selection constraint on the offspring.
        offspring = toolbox.individuals_constraints(offspring)
        pop = pop + offspring
        pop, _ = toolbox.remove_duplicates(pop)
        while len(pop) < POPSIZE:  # refill until the population holds at least POPSIZE individuals
            add_individual = []
            num_add = POPSIZE - len(pop)
            for i in range(0, num_add):
                index = random.randint(0, len(offspring) - 1)  # random offspring index (inclusive bounds)
                # NOTE(review): the chosen offspring is mutated without cloning; if the
                # same object is still referenced from `pop` it changes there too - confirm intended.
                offspring[index] = toolbox.mutate(offspring[index], MR)[0]
                del offspring[index].fitness.values
                add_individual.append(offspring[index])
            add_individual = toolbox.individuals_constraints(add_individual)
            pop = pop + add_individual
            pop, _ = toolbox.remove_duplicates(pop)
        # Re-apply the per-class minimum-selection constraint to the whole population.
        # NOTE(review): the original comments disagree on the minimum (1 vs 5); see individuals_constraints.
        pop = toolbox.individuals_constraints(pop)
        toolbox.evaluate(pop)  # evaluate the merged population

        # ---------------------------- environmental selection ----------------------------
        feasible_pop, infeasible_pop = toolbox.get_feasible_infeasible(pop)
        if len(feasible_pop) >= POPSIZE:
            # Enough feasible individuals: select POPSIZE of them and ensemble all survivors.
            pop, pareto_fronts = toolbox.select(feasible_pop, POPSIZE)
            ensembles = pop
        else:
            # Top up with infeasible individuals so that pop has POPSIZE members.
            # NOTE(review): presumably infeasible_pop is ordered by increasing constraint
            # violation - confirm in get_feasible_infeasible.
            pop = feasible_pop + infeasible_pop[:POPSIZE - len(feasible_pop)]
            if len(feasible_pop) > 0:
                ensembles = feasible_pop  # ensemble the feasible individuals only
            else:
                ensembles = [infeasible_pop[0]]  # no feasible solution: use the first infeasible one

        # NOTE(review): the two averages below are computed but never used afterwards.
        avg_gmean, avg_mauc = calculate_average_gmean_mauc(ensembles)

        # Median G-mean / mAUC over the ensemble members, recorded per generation.
        median_gmean = statistics.median([ind.gmean for ind in ensembles])
        median_mauc = statistics.median([ind.mauc for ind in ensembles])
        median_gmean_list.append(median_gmean)
        median_mauc_list.append(median_mauc)

        _, avg_acc2, _ = calculate_average_accuracy(ensembles)  # only the second average is logged

        # Test-split performance of the voted ensemble for this generation.
        ensemble_curr = ensemble_individuals(ensembles, model, x_train, y_train)
        vote_pred_prob = vote_result_ensembles(ensemble_curr, x_test)  # soft labels by default (original note)
        gmean_curr, mauc_curr, _ = calculate_gmean_mauc(vote_pred_prob, y_test)
        perfomance_per_generation.append([gmean_curr, mauc_curr])

        record = stats.compile(pop)
        logbook.record(gen=gen, feasible=len(feasible_pop), ensembles_size=len(ensembles), median_gmean=median_gmean,
                       median_mauc=median_mauc, avg_acc2=avg_acc2, **record)
        print(logbook.stream)
    # Build the returned ensemble from a fresh clone of the base model.
    ensemble_classifiers = ensemble_individuals(ensembles, clone(model), x_train, y_train)
    return ensemble_classifiers

In [7]:
if __name__ == "__main__":
    # Run the evolutionary search, then score the returned ensemble on the test split.
    ensemble_classifiers = main(x_train, y_train, model=model, balanced_method='random')
    print("##############################集成分类器的预测结果：################################")
    vote_pred_prob = vote_result_ensembles(ensemble_classifiers, x_test)  # soft labels by default (original note)
    vote_pred = np.argmax(vote_pred_prob, axis=1)  # hard predictions from the voted probabilities
    gmean, mauc, recall_per_class = calculate_gmean_mauc(vote_pred_prob, y_test)
    acc1, acc2, acc3 = calculate_accuracy(vote_pred, y_test, weights_test)
    print(f"集成分类结果：Recall{recall_per_class}，Gmean：{gmean}，mAUC：{mauc}，Acc1：{acc1}，Acc2：{acc2}，Acc3：{acc3}")
    print("训练已结束！")
    # One [gmean, mauc] pair per recorded step (initial population, then every generation).
    # The loop variable must not be called `list`: at module level that would
    # shadow the builtin for everything executed afterwards.
    for generation_scores in perfomance_per_generation:
        print(generation_scores)

gen	feasible	ensembles_size	median_gmean	median_mauc	avg_acc2
1  	4       	4             	0.959053    	0.997234   	0.961536
2  	6       	6             	0.959053    	0.997169   	0.960676
3  	13      	13            	0.961209    	0.997081   	0.962835
4  	18      	18            	0.960809    	0.996698   	0.963743
5  	24      	24            	0.96083     	0.996353   	0.964002
6  	31      	30            	0.96083     	0.996078   	0.965204
7  	34      	30            	0.965525    	0.996353   	0.967493
8  	34      	30            	0.967564    	0.996522   	0.969131
9  	32      	30            	0.967871    	0.996522   	0.96941 
10 	37      	30            	0.971487    	0.996698   	0.97225 
11 	34      	30            	0.972498    	0.996843   	0.972773
12 	42      	30            	0.976604    	0.997124   	0.974991
13 	38      	30            	0.975695    	0.997124   	0.975311
14 	37      	30            	0.978338    	0.997167   	0.97769 
15 	41      	30            	0.979413    	0.997015   	0.980262
16 	33  

In [8]:
import numpy as np

# Per-generation medians collected by main(), converted to arrays for export.
array1 = np.array(median_gmean_list)
array2 = np.array(median_mauc_list)
# Save as CSV files.
# NOTE(review): the output directory is a hard-coded absolute Windows path, and the
# second file name has no '_' before the dataset name, unlike the first - confirm
# before relying on these names elsewhere.
np.savetxt('C:/Users/zsc/Desktop/median_gmean_mile_'+datasetname+'.csv', array1, delimiter=',')
# The mAUC file must receive the mAUC medians (array2); it previously got array1.
np.savetxt('C:/Users/zsc/Desktop/median_mauc_mile'+datasetname+'.csv', array2, delimiter=',')