# E-SEIC
Selection of Evolutionary Instances with Constraints (SEIC) for imbalanced datasets.

In [3]:
from utils.dataset_utils import get_distribution, k_fold_cross_validation
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection_encapsulation.operator.init_toolbox import init_toolbox_eseic
from instance_selection_encapsulation.operator.metrics import calculate_gmean_mauc, calculate_average_accuracy, \
    calculate_average_gmean_mauc, calculate_accuracy
from instance_selection_encapsulation.operator.genetic_operator import selTournamentNDCD
from instance_selection_encapsulation.operator.ensemble import vote_result_ensembles, ensemble_individuals
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
import scipy.io as sio  # 从.mat文件中读取数据集
import random
from deap import tools
import warnings
import numpy as np

# ---------------------------------------------------------------------------
# Experiment setup: load the dataset, make a stratified 70/30 split,
# standardise the features, and derive the accuracy constraints from a
# cross-validated baseline model. Every name defined here is a module-level
# global that the later cells (toolbox creation, main(), the run loop) read.
# ---------------------------------------------------------------------------
warnings.filterwarnings("ignore")  # silence all warnings for the whole session

DATASET = Pen_Digits  # dataset descriptor (also carries the per-dataset hyper-parameters read below)

datasetname = DATASET.DATASETNAME.split('.')[0]  # file name up to the first '.'
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET.DATASETNAME)  # load the .mat file
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] flattens it to shape [n,]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y,
                                                    random_state=RANDOM_SEED)  # stratified 70/30 train/test split
scaler = StandardScaler()  # feature standardisation
x_train = scaler.fit_transform(x_train)  # statistics are fitted on the training split only
x_test = scaler.transform(x_test)  # the test split reuses the training statistics

unique_elements_all, classes_all, counts_all = get_distribution(y)  # class distribution of the full dataset
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # class distribution of the training split
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # class distribution of the test split
print(datasetname + f' distribution: {counts_all}')
print(f'trainset distribution: {counts_train}')
print(f'testset distribution: {counts_test}')

# Base classifier; its hyper-parameters come from the dataset descriptor.
model = MLPClassifier(hidden_layer_sizes=(DATASET.HIDDEN_SIZE,), max_iter=DATASET.MAX_ITER,
                      random_state=RANDOM_SEED, learning_rate_init=DATASET.LEARNING_RATE)

# Per-class weights: inverse class frequency, normalised so the weights sum to 1.
weights_train = (1 / counts_train.astype(float)) / np.sum(1 / counts_train.astype(float))  # weights from the training split
weights_test = (1 / counts_test.astype(float)) / np.sum(1 / counts_test.astype(float))  # weights from the test split

# 90% of the smallest training-class count, rounded UP by np.ceil.
# NOTE(review): the original comment said "rounded down", which does not match the code.
num_instances = int(np.ceil(counts_train.min() * 0.9))
print("最小数量:", num_instances)

# Cross-validated predictions of a fresh clone of the base model on the training split.
# method='soft' is requested and the result is arg-maxed over axis 1 below, i.e. it is
# used as a per-sample, per-class score matrix.
# NOTE(review): the reason for using N_SPLITS - 2 folds is not visible here - confirm.
y_train_pred_proba = k_fold_cross_validation(model=clone(model), X=x_train, y=y_train, n_splits=N_SPLITS - 2,
                                             method='soft',
                                             random_state=RANDOM_SEED)
# Convert the scores to hard predictions (index of the highest-scoring column).
y_train_pred = np.argmax(y_train_pred_proba, axis=1)

# The three baseline accuracy figures are handed to init_toolbox_eseic as `constraints`.
# NOTE(review): the exact definitions of Acc1/Acc2/Acc3 live in calculate_accuracy.
Acc1, Acc2, Acc3 = calculate_accuracy(y_train_pred, y_train, weights_train)
constraints = [Acc1, Acc2, Acc3]

Pen_Digits distribution: [9937 1055]
trainset distribution: [6956  738]
testset distribution: [2981  317]
最小数量: 665


## E-SEIC

In [4]:
# Module-level toolbox shared by main(): it is built once from the base model, the
# standardised training split, the training class weights and the baseline constraints.
toolbox = init_toolbox_eseic(model, x_train, y_train, weights_train, constraints, n_splits=N_SPLITS - 2,
                             random_seed=RANDOM_SEED)  # initialise the toolbox


def main(x_train, y_train, model, balanced_method='balanced'):
    """Run one constrained evolutionary instance-selection search and build the final ensemble.

    Relies on the module-level ``toolbox`` and on the constants ``POPSIZE``,
    ``NGEN``, ``CXPB`` and ``MR``.

    Args:
        x_train: Training features, passed through to ``ensemble_individuals``.
        y_train: Training labels, passed through to ``ensemble_individuals``.
        model: Base estimator, passed through to ``ensemble_individuals``.
        balanced_method: Forwarded to ``toolbox.init_population``.

    Returns:
        Whatever ``ensemble_individuals`` returns for the individuals chosen in
        the last generation (the caller iterates over it and calls
        ``predict_proba`` on every element).

    Note:
        ``ensembles`` is assigned only inside the generation loop, so running
        with ``NGEN < 1`` would raise ``UnboundLocalError`` at the final call.
    """
    # NOTE(review): no statistic functions are registered on `stats` in this function.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    logbook = tools.Logbook()
    logbook.header = "gen", "feasible", "ensembles_size", "avg_gmean", "avg_mauc", "avg_acc2"
    #################################### population initialisation ###########################
    pop = toolbox.population(n=POPSIZE)  # per the original comment, individuals start encoded as all zeros
    pop = toolbox.init_population(pop, balanced_method=balanced_method)  # initialise the individuals
    toolbox.evaluate(pop)  # compute the individuals' fitness
    #################################### generational loop #################################################
    for gen in range(1, NGEN + 1):
        # Tournament selection (original comment: first by non-dominated rank, then by crowding distance).
        offspring = selTournamentNDCD(pop, POPSIZE, tournsize=3)
        offspring = [toolbox.clone(ind) for ind in offspring]  # work on copies so the parents stay intact
        # Walk the offspring in consecutive pairs; with an odd count the last one is left unchanged.
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])  # crossover (original comment: single-point)
            # Mutation is applied to both members of every pair, whether or not they were crossed.
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]  # mutation (original comment: binary bit-flip)
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]  # mutation (original comment: binary bit-flip)
            del offspring[i].fitness.values, offspring[i + 1].fitness.values  # invalidate the stale fitness
        ############################################################# merge and de-duplicate #####################################################
        offspring = toolbox.individuals_constraints(offspring)  # original comment: every class keeps at least one selected instance
        pop = pop + offspring  # merge parents and offspring
        pop, _ = toolbox.remove_duplicates(pop)  # drop duplicates
        while len(pop) < POPSIZE:  # refill until the population holds POPSIZE individuals again
            add_individual = []
            num_add = POPSIZE - len(pop)
            for i in range(0, num_add):
                index = random.randint(0, len(offspring) - 1)  # random index in [0, len(offspring) - 1]
                # NOTE(review): the offspring is mutated without being cloned first, and the same
                # object can be picked and appended more than once - confirm this aliasing is intended.
                offspring[index] = toolbox.mutate(offspring[index], MR)[0]  # mutate the chosen individual
                del offspring[index].fitness.values
                add_individual.append(offspring[index])
            add_individual = toolbox.individuals_constraints(add_individual)  # original comment: every class keeps at least one selected instance
            pop = pop + add_individual  # merge the refill candidates
            pop, _ = toolbox.remove_duplicates(pop)  # drop duplicates
        # NOTE(review): the original comment here says "at least 5 instances per class" while the
        # two calls above say "at least one"; the real minimum is defined by individuals_constraints.
        pop = toolbox.individuals_constraints(pop)
        toolbox.evaluate(pop)  # compute the fitness of the new population
        ############################################### survivor selection / ensemble members ############################################
        feasible_pop, infeasible_pop = toolbox.get_feasible_infeasible(pop)  # split into feasible and infeasible solutions
        if len(feasible_pop) >= POPSIZE:
            pop, pareto_fronts = toolbox.select(feasible_pop, POPSIZE)
            # ensembles = pareto_fronts[0]  # pareto_first_front
            ensembles = pop  # every survivor is feasible, so all of them join the ensemble
        elif len(feasible_pop) > 0:
            # Top the feasible solutions up to POPSIZE with the leading infeasible ones.
            # NOTE(review): assumes infeasible_pop is ordered by ascending constraint violation - confirm.
            pop = feasible_pop + infeasible_pop[:POPSIZE - len(feasible_pop)]
            # ensembles = tools.sortNondominated(feasible_pop, len(feasible_pop))[0]  # pareto_first_front
            ensembles = feasible_pop  # only feasible solutions join the ensemble
            # ensembles = pop  # ensemble the whole population
        else:
            # No feasible solution: feasible_pop is empty, so pop is simply the leading POPSIZE infeasible ones.
            pop = feasible_pop + infeasible_pop[:POPSIZE - len(feasible_pop)]
            ensembles = [infeasible_pop[0]]  # fall back to the first infeasible individual alone
            # ensembles = pop  # ensemble the whole population
        avg_gmean, avg_mauc = calculate_average_gmean_mauc(ensembles)  # average G-mean and mAUC over the ensemble members
        _, avg_acc2, _ = calculate_average_accuracy(ensembles)  # of the three averaged accuracies only the second is logged

        record = stats.compile(pop)
        logbook.record(gen=gen, feasible=len(feasible_pop), ensembles_size=len(ensembles), avg_gmean=avg_gmean,
                       avg_mauc=avg_mauc, avg_acc2=avg_acc2, **record)
        # print(logbook.stream)
    # Turn the individuals kept in the last generation into classifiers.
    ensemble_classifiers = ensemble_individuals(ensembles, model, x_train, y_train)
    return ensemble_classifiers


if __name__ == "__main__":
    # Repeat the evolutionary search `num_run` times. For every run, report each
    # ensemble member on the test split, then score the soft-voted ensemble and
    # store one result row per run. `ensembles_results` is read by the Excel
    # export cell, so it stays a list with exactly one row per run.
    print("*****************算法开始执行：******************")
    num_run = 40  # number of independent runs
    ensembles_results = [[] for _ in range(num_run)]
    for run_idx in range(num_run):
        ensemble_classifiers = main(x_train, y_train, model=model, balanced_method='random')
        # The inner loop must use its own index name: reusing the run index here
        # made the result row land at the last classifier's index instead of the
        # run's index, and mislabelled the per-run summary line.
        for clf_idx, classifier in enumerate(ensemble_classifiers):
            ind_proba = classifier.predict_proba(x_test)
            gmean, mauc, recall_per_class = calculate_gmean_mauc(ind_proba, y_test)
            # Per-classifier G-mean, mAUC and per-class recall on the test split.
            print(f"\t第{clf_idx + 1}个分类器分类结果：Recall{recall_per_class}，Gmean：{gmean}，mAUC：{mauc}")
        vote_pred_prob = vote_result_ensembles(ensemble_classifiers, x_test)  # ensemble vote (original comment: soft labels by default)
        vote_pred = np.argmax(vote_pred_prob, axis=1)  # hard prediction = highest-scoring column
        gmean, mauc, recall_per_class = calculate_gmean_mauc(vote_pred_prob, y_test)
        acc1, acc2, acc3 = calculate_accuracy(vote_pred, y_test, weights_test)
        # Row layout: Gmean, mAUC, Acc1, Acc2, Acc3, ensemble size.
        ensembles_results[run_idx] = [gmean, mauc, acc1, acc2, acc3, len(ensemble_classifiers)]
        print(
            f"第{run_idx + 1}次执行：Gmean：{gmean}，mAUC：{mauc}，Acc1：{acc1}，Acc2：{acc2}，Acc3：{acc3}，集成的数量：{len(ensemble_classifiers)}")
    print("*****************算法执行结束！******************")
    # Column-wise mean over all runs; every row is filled, so the array is rectangular.
    ensembles_result_mean = np.mean(ensembles_results, axis=0)
    print(f'集成分类结果（平均值）：{ensembles_result_mean}')

*****************算法开始执行：******************
	第1个分类器分类结果：Recall[0.99932908 0.99369085]，Gmean：0.996506，mAUC：0.999946
	第2个分类器分类结果：Recall[0.99932908 0.99053628]，Gmean：0.994923，mAUC：0.999978
	第3个分类器分类结果：Recall[0.99932908 0.99053628]，Gmean：0.994923，mAUC：0.999932
	第4个分类器分类结果：Recall[0.99899363 0.99053628]，Gmean：0.994756，mAUC：0.999958
	第5个分类器分类结果：Recall[0.99966454 0.98422713]，Gmean：0.991916，mAUC：0.999969
	第6个分类器分类结果：Recall[0.99899363 0.99369085]，Gmean：0.996339，mAUC：0.999944
	第7个分类器分类结果：Recall[0.99966454 0.99053628]，Gmean：0.99509，mAUC：0.999953
	第8个分类器分类结果：Recall[0.99932908 0.98422713]，Gmean：0.991749，mAUC：0.999957
	第9个分类器分类结果：Recall[0.99966454 0.98107256]，Gmean：0.990325，mAUC：0.999972
	第10个分类器分类结果：Recall[0.99899363 0.99369085]，Gmean：0.996339，mAUC：0.999983
	第11个分类器分类结果：Recall[0.99899363 1.        ]，Gmean：0.999497，mAUC：0.999997
	第12个分类器分类结果：Recall[0.99966454 0.9873817 ]，Gmean：0.993504，mAUC：0.999952
	第13个分类器分类结果：Recall[0.99966454 0.99369085]，Gmean：0.996673，mAUC：0.99999
	第14个分类器分类结果：Recall[0.99832271 0

## 写入到Excel 

In [5]:
from utils.excel_utils import save_to_excel_2

# Column headers, in the same order as the values stored in each row of `ensembles_results`.
columns = ['Gmean', 'MAUC', 'Acc1', 'Acc2', 'Acc3', 'num_ensemble']

# The workbook is named after the dataset and written into a per-dataset directory.
filename = datasetname
save_path = f'C:/Users/Lenovo/Desktop/Stratified/7-3/MILE/{datasetname}/'
save_to_excel_2(save_path, filename, columns, ensembles_results)

Excel 文件已保存至: C:/Users/Lenovo/Desktop/Stratified/7-3/MILE/Pen_Digits/Pen_Digits.xlsx


'C:/Users/Lenovo/Desktop/Stratified/7-3/MILE/Pen_Digits/Pen_Digits.xlsx'