# E-MOSAIC

引用自[E. R. Q. Fernandes, A. C. P. L. F. de Carvalho and X. Yao, "Ensemble of Classifiers Based on Multiobjective Genetic Sampling for Imbalanced Data," in IEEE Transactions on Knowledge and Data Engineering, vol. 32, no. 6, pp. 1104-1115, 1 June 2020, doi: 10.1109/TKDE.2019.2898861.]


## 数据集的预处理 

In [3]:
from utils.dataset_utils import get_distribution
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection.operator.init_toolbox import init_toolbox_emosaic
from instance_selection.operator.metrics import calculate_gmean_mauc, calculate_average_gmean_mauc
from instance_selection.operator.genetic_operator import selTournamentNDCD
from instance_selection.operator.ensemble import vote_result_ensembles, ensemble_individuals
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import scipy.io as sio  # 从.mat文件中读取数据集
import random
from deap import tools
import warnings

warnings.filterwarnings("ignore")  # suppress all warnings for the rest of the session

DATASET = Balance_Scale  # dataset configuration object (file name plus its hyper-parameters)
datasetname = DATASET.DATASETNAME.split('.')[0]  # file name without the extension, used in prints/output paths

# Load the dataset from the .mat file and split it into train/test sets
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET.DATASETNAME)
#datasetname = 'Lymph.mat'
# mat_data = sio.loadmat('../../data/dataset/' + datasetname)
x = mat_data['X']
y = mat_data['Y'][:, 0]  # per the original note, mat_data['Y'] is shaped [n, 1]; [:, 0] flattens it to [n,]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=RANDOM_SEED)  # 70/30 split
scaler = StandardScaler()  # feature standardization
x_train = scaler.fit_transform(x_train)  # fit the scaler on the training set only
x_test = scaler.transform(x_test)  # reuse the training-set statistics on the test set

unique_elements_all, classes_all, counts_all = get_distribution(y)  # class distribution of the full dataset
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # class distribution of the training set
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # class distribution of the test set
print(datasetname + f' distribution: {counts_all}')
print(f'trainset distribution: {counts_train}')
print(f'testset distribution: {counts_test}')
# Base classifier: a single-hidden-layer MLP configured from the dataset's parameter object
model = MLPClassifier(hidden_layer_sizes=(DATASET.HIDDEN_SIZE,), max_iter=DATASET.MAX_ITER,
                      random_state=RANDOM_SEED, learning_rate_init=DATASET.LEARNING_RATE)

Balance_Scale distribution: [ 49 288 288]
trainset distribution: [ 31 208 198]
testset distribution: [18 80 90]


## E-MOSAIC

In [11]:
import statistics

# Build the toolbox used by main(); it is configured with the base model and the training data
toolbox = init_toolbox_emosaic(model, x_train, y_train, n_splits=N_SPLITS, random_seed=RANDOM_SEED)  # initialize the toolbox

# Module-level accumulators that main() appends to as a side effect
perfomance_per_generation = []  # [gmean, mauc] of the voted ensemble on the test set, one entry per generation
median_gmean_list = []  # median of ind.gmean over the population, one entry per generation
median_mauc_list = []  # median of ind.mauc over the population, one entry per generation
def main(x_train, y_train, model, balanced_method='balanced'):
    """Run the E-MOSAIC evolutionary loop and return the saved ensemble's classifiers.

    Besides its arguments, this function reads the module-level ``toolbox``,
    ``x_test`` and ``y_test``, and appends to the module-level lists
    ``perfomance_per_generation``, ``median_gmean_list`` and
    ``median_mauc_list`` as a side effect.

    Args:
        x_train: Training features passed to ``ensemble_individuals``.
        y_train: Training labels passed to ``ensemble_individuals``.
        model: Base classifier passed to ``ensemble_individuals``.
        balanced_method: Forwarded to ``toolbox.init_population``.

    Returns:
        The result of ``ensemble_individuals`` for the saved (best-so-far)
        population.
    """
    not_replaced = 0  # consecutive generations in which the saved ensemble was not replaced
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    logbook = tools.Logbook()
    logbook.header = "gen", "fronts", "ensembles_size", "median_gmean", "median_mauc"
    # ------------------------- population initialization -------------------------
    pop = toolbox.population(n=POPSIZE)  # per the original note, encodings start as all zeros
    pop = toolbox.init_population(pop, balanced_method=balanced_method)
    toolbox.evaluate(pop)  # compute the fitness of every individual
    save_ensembles = pop  # the initial population is the first saved ensemble
    save_gmean, save_mauc = calculate_average_gmean_mauc(pop)
    ensemble_initial = ensemble_individuals(pop, model, x_train, y_train)
    vote_pred_prob = vote_result_ensembles(ensemble_initial, x_test)  # soft-label predictions by default
    gmean_init, mauc_init, _ = calculate_gmean_mauc(vote_pred_prob, y_test)
    perfomance_per_generation.append([gmean_init, mauc_init])
    # ----------------------------- generation loop -----------------------------
    # Fixed: the original iterated range(1, NGEN + 11), i.e. 10 generations more
    # than NGEN, which looks like a typo for NGEN + 1.
    for gen in range(1, NGEN + 1):
        # Tournament selection (by non-dominated rank first, then crowding distance)
        offspring = selTournamentNDCD(pop, POPSIZE, tournsize=3)
        offspring = [toolbox.clone(ind) for ind in offspring]
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])  # crossover
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]  # bit-flip mutation
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]  # bit-flip mutation
            del offspring[i].fitness.values, offspring[i + 1].fitness.values
        # ----------------------- merge and de-duplicate -----------------------
        offspring = toolbox.individuals_constraints(offspring)  # enforce the per-class selection constraint
        pop = pop + offspring
        pop, _ = toolbox.remove_duplicates(pop)
        while len(pop) < POPSIZE:  # refill until the population has POPSIZE individuals again
            add_individual = []
            num_add = POPSIZE - len(pop)
            for _ in range(num_add):
                index = random.randint(0, len(offspring) - 1)
                # Fixed: mutate a clone. The original mutated offspring[index]
                # in place, but those objects were already merged into pop and
                # the same object could be appended several times.
                mutant = toolbox.mutate(toolbox.clone(offspring[index]), MR)[0]
                del mutant.fitness.values
                add_individual.append(mutant)
            add_individual = toolbox.individuals_constraints(add_individual)
            pop = pop + add_individual
            pop, _ = toolbox.remove_duplicates(pop)
        pop = toolbox.individuals_constraints(pop)  # re-apply the per-class selection constraint
        toolbox.evaluate(pop)  # compute the fitness of the merged population
        # --------------------------- Pareto fronts ---------------------------
        pop, pareto_fronts = toolbox.select(pop, POPSIZE)
        ensembles = pop  # the whole selected population forms the ensemble

        # Track the voted ensemble's test-set performance for this generation.
        ensemble_curr = ensemble_individuals(pop, model, x_train, y_train)
        vote_pred_prob = vote_result_ensembles(ensemble_curr, x_test)  # soft-label predictions by default
        gmean_curr, mauc_curr, _ = calculate_gmean_mauc(vote_pred_prob, y_test)
        perfomance_per_generation.append([gmean_curr, mauc_curr])

        curr_gmean, curr_mauc = calculate_average_gmean_mauc(ensembles)
        # Medians of the per-individual metrics, logged and stored globally.
        median_gmean = statistics.median([ind.gmean for ind in ensembles])
        median_mauc = statistics.median([ind.mauc for ind in ensembles])
        median_gmean_list.append(median_gmean)
        median_mauc_list.append(median_mauc)
        if curr_gmean >= save_gmean and curr_mauc >= save_mauc:
            save_ensembles = ensembles
            # Fixed: keep the reference scores in sync with the saved ensemble;
            # the original never updated them, so every generation was compared
            # against the initial population only.
            save_gmean, save_mauc = curr_gmean, curr_mauc
            not_replaced = 0
        else:
            not_replaced += 1
        if not_replaced >= STOP_SIGN:
            break  # early stop: no replacement for STOP_SIGN consecutive generations
        record = stats.compile(pop)
        logbook.record(gen=gen, fronts=len(pareto_fronts), ensembles_size=len(ensembles),
                       median_gmean=median_gmean, median_mauc=median_mauc, **record)
        print(logbook.stream)
    ensemble_classifiers = ensemble_individuals(save_ensembles, model, x_train, y_train)
    return ensemble_classifiers


if __name__ == "__main__":
    # Run the evolutionary search, then evaluate the returned ensemble on the test set.
    print("*****************算法开始执行：******************")
    ensemble_classifiers = main(x_train, y_train, model=model)
    vote_pred_prob = vote_result_ensembles(ensemble_classifiers, x_test)  # soft-label predictions by default
    gmean, mauc, recall_per_class = calculate_gmean_mauc(vote_pred_prob, y_test)
    # Fixed: the printed label was misspelled "Reacll".
    print(f"Recall:{recall_per_class}，Gmean：{gmean}，mAUC：{mauc}")
    print("*****************算法执行结束！******************")
    # Dump the per-generation [gmean, mauc] pairs collected by main().
    # Fixed: the loop variable was named `list`, shadowing the builtin.
    for generation_scores in perfomance_per_generation:
        print(generation_scores)

*****************算法开始执行：******************
gen	fronts	ensembles_size	median_gmean	median_mauc
1  	11    	30            	0.917894    	0.980109   
2  	8     	30            	0.936983    	0.986448   
3  	7     	30            	0.945345    	0.990155   
4  	6     	30            	0.953568    	0.991135   
5  	7     	30            	0.961631    	0.992633   
6  	6     	30            	0.966962    	0.994497   
7  	6     	30            	0.971116    	0.995718   
8  	6     	30            	0.972345    	0.995939   
9  	6     	30            	0.974706    	0.996453   
10 	7     	30            	0.977472    	0.996647   
11 	6     	30            	0.978619    	0.996549   
12 	6     	30            	0.978441    	0.996738   
13 	6     	30            	0.978815    	0.996837   
14 	6     	30            	0.97988     	0.997453   
15 	6     	30            	0.97988     	0.997851   
16 	7     	30            	0.981195    	0.997958   
17 	7     	30            	0.982505    	0.998085   
18 	7     	30            	0.982505    	

In [12]:
import numpy as np

array1 = np.array(median_gmean_list)
array2 = np.array(median_mauc_list)
# Save the per-generation medians as CSV files
np.savetxt('C:/Users/zsc/Desktop/median_gmean_emosaic_'+datasetname+'.csv', array1, delimiter=',')
# Fixed: the mAUC file was written from array1 (the G-mean medians), leaving array2 unused.
# NOTE(review): this file name has no '_' before the dataset name, unlike the G-mean file;
# kept unchanged in case something reads it by that name — confirm and align.
np.savetxt('C:/Users/zsc/Desktop/median_mauc_emosaic'+datasetname+'.csv', array2, delimiter=',')