# E-MOSAIC

引用自[E. R. Q. Fernandes, A. C. P. L. F. de Carvalho and X. Yao, "Ensemble of Classifiers Based on Multiobjective Genetic Sampling for Imbalanced Data," in IEEE Transactions on Knowledge and Data Engineering, vol. 32, no. 6, pp. 1104-1115, 1 June 2020, doi: 10.1109/TKDE.2019.2898861.]


## 数据集的预处理 

In [1]:
from instance_selection.e_mosaic.fitness import calculate_gmean_mauc
from sklearn.preprocessing import StandardScaler
import numpy as np
from utils.dataset_utils import get_classes_indexes_counts, k_fold_cross_validation
import scipy.io as sio  # 从.mat文件中读取数据集
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone

# Random seed, passed below to train_test_split, MLPClassifier and k_fold_cross_validation.
random_seed = 43

print("#########################加载数据集#########################")
# Per-dataset settings noted by the author:
#   Nursery(20, 100, 0.1), Satellite(15, 100, 0.1), Contraceptive(15, 200, 0.1), Chess(20, 200, 0.1)
# NOTE(review): these look like (hidden_size, max_iter, learning_rate) -- the Chess triple matches
# the values set further down -- confirm.
datasetname = 'Chess2.mat'
mat_data = sio.loadmat('../../data/dataset/' + datasetname)

dataset_x = mat_data['X']
dataset_y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] flattens it to shape [n]
print("特征数据:", dataset_x.shape, "label:", dataset_y.shape)  # show the dataset shapes
classes, counts = get_classes_indexes_counts(dataset_y)  # count the instances of each class
print("每种类别的分布：", counts)

print("#########################划分数据集#########################")
# 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(dataset_x, dataset_y, test_size=0.3, random_state=random_seed)
# Standardise features: the scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print("特征数据:", x_train.shape, "label:", y_train.shape)  # show the training-set shapes
classes_train, counts_train = get_classes_indexes_counts(y_train)  # count the instances of each class
print("训练集每种类别的分布：", counts_train)
classes_test, counts_test = get_classes_indexes_counts(y_test)
print("测试集每种类别的分布：", counts_test)

print("#########################平衡数据集#########################")
# 0.9 x the size of the smallest training class, rounded UP (np.ceil).
# It is only printed in this cell.
num_instances = int(np.ceil(counts_train.min() * 0.9))
print("最小数量:", num_instances)

# MLP hyper-parameters
learning_rate = 0.1  # initial learning rate
hidden_size = 20  # number of neurons in the single hidden layer
max_iter = 200  # maximum number of iterations (epochs)
n_splits = 5  # number of cross-validation folds
# Baseline metrics of an MLP trained on the full training set.
mlp = MLPClassifier(hidden_layer_sizes=(hidden_size,), max_iter=max_iter, random_state=random_seed,
                    learning_rate_init=learning_rate)
y_train_pred_proba = k_fold_cross_validation(model=mlp, X=x_train, y=y_train, n_splits=n_splits, method='soft',
                                             random_state=random_seed)  # cross-validated soft labels
# Convert the probabilities into predicted class indices (not used further in this cell).
y_train_pred = np.argmax(y_train_pred_proba, axis=1)

# Third return value of calculate_gmean_mauc is discarded here.
gmean_train, mauc_train, _ = calculate_gmean_mauc(y_train_pred_proba, y_train)
print("gmean_train:", gmean_train, "mauc_train:", mauc_train)

# Fit a fresh clone on the whole training set and score it on the held-out test set.
mlp_model = clone(mlp)
mlp_model.fit(x_train, y_train)
y_test_pred_proba = mlp_model.predict_proba(x_test)
gmean_test, mauc_test, _ = calculate_gmean_mauc(y_test_pred_proba, y_test)
print("gmean_test:", gmean_test, "mauc_test:", mauc_test)

#########################加载数据集#########################
特征数据: (28056, 6) label: (28056,)
每种类别的分布： [2796 1433 2854 2166  471  198 4553 1712   78  683  592  390 1985 4194
   81 3597  246   27]
#########################划分数据集#########################
特征数据: (19639, 6) label: (19639,)
训练集每种类别的分布： [1931 1017 1987 1528  307  131 3164 1182   59  494  405  268 1373 2973
   57 2570  174   19]
测试集每种类别的分布： [ 865  416  867  638  164   67 1389  530   19  189  187  122  612 1221
   24 1027   72    8]
#########################平衡数据集#########################
最小数量: 18
gmean_train: 0.2941 mauc_train: 0.9075
gmean_test: 0.0 mauc_test: 0.9004


## NSGA-II

In [2]:
from instance_selection.e_mosaic.duplicate_process import find_duplicates, remove_duplicates
from instance_selection.e_mosaic.fitness import fitness_function
from instance_selection.e_mosaic.pop_init import init_by_one_or_zero, init_population_for_balanced_dataset, \
    init_population_for_balanced_dataset_2
from instance_selection.e_mosaic.genetic_operator import selNSGA2, mutate_binary_inversion, selTournamentDCD
import warnings

import array
import random
from deap import base
from deap import creator
from deap import tools

warnings.filterwarnings("ignore")  # silence all warnings
# Basic parameters
x_init_train = x_train  # feature matrix the individuals select from
y_init_train = y_train  # labels the individuals select from
NDIM = len(y_init_train)  # genome length: one gene per training instance
# Two objectives, both maximised (positive weights).
creator.create("FitnessMaxAndMax", base.Fitness, weights=(1.0, 1.0))
# Individual: an integer array ('i') carrying extra per-individual attributes, all initialised to None.
creator.create("Individual", array.array, typecode='i', fitness=creator.FitnessMaxAndMax, pfc=None, model=None,
               y_sub_and_pred_proba=None, gmean=None, mauc=None)
toolbox = base.Toolbox()
# 0/1 encoding. NOTE(review): presumably binary=0 makes every gene start at 0 -- confirm in init_by_one_or_zero.
toolbox.register("attr_binary", init_by_one_or_zero, binary=0)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_binary, n=NDIM)  # build one individual
# Per the author's note: initialise each individual as a balanced subset (min class count * 0.9 instances).
toolbox.register("balanced_dataset_for_population", init_population_for_balanced_dataset_2, y_train=y_init_train,
                 ratio=0.9, show_details=False)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)  # build a population (list of individuals)
toolbox.register("evaluate", fitness_function)  # objective function
toolbox.register("mate", tools.cxOnePoint)  # one-point crossover
toolbox.register("mutate", mutate_binary_inversion)  # binary (bit-flip) mutation
# Disabled alternative: the project's own selNSGA2, which additionally receives the test set.
#toolbox.register("select", selNSGA2, x_test=x_test, y_test=y_test)
toolbox.register("select", tools.selNSGA2)  # NSGA-II selection (non-dominated sorting + crowding distance)
toolbox.register("find_duplicates", find_duplicates)  # find index pairs of duplicated individuals in a population
toolbox.register("remove_duplicates", remove_duplicates)  # drop the duplicates

## 种群的迭代

In [3]:
from instance_selection.e_mosaic.genetic_operator import selTournamentNDCD
from instance_selection.e_mosaic.ensemble_operator import vote_ensembles, calculate_gmean_mauc, \
    ensembles_individuals_gmean_mauc
from utils.dataset_utils import get_subset, k_fold_cross_validation
from sklearn.base import clone


def main(random_seed, model, n_splits):
    """Evolve instance-selection masks with NSGA-II and collect the final models.

    Each individual encodes a subset of ``x_init_train`` / ``y_init_train``
    (module-level globals). For every individual a clone of ``model`` is fitted
    on its subset, and its G-mean / mAUC are computed from cross-validated soft
    labels on that subset.

    Note: only the cross-validation receives ``random_seed``. Python's
    ``random`` module, which drives crossover and the refill step, is not
    seeded here.

    Args:
        random_seed: forwarded as ``random_state`` to ``k_fold_cross_validation``.
        model: unfitted estimator prototype; it is cloned before being fitted.
        n_splits: number of cross-validation folds.

    Returns:
        A tuple ``(pop, stats, ensembles, ensembles_individuals)``: the final
        population, the ``tools.Statistics`` object, the list of fitted models
        (one per individual of the final population), and the individuals those
        models belong to (the final population itself).
    """
    NGEN = 40  # number of generations
    POPSIZE = 40  # population size
    CXPB = 1.0  # crossover probability
    MR = 0.2  # mutation rate

    def get_fitness_values(individual):
        """Fit a model on the individual's subset and store model, metrics and fitness on it."""
        x_sub, y_sub = get_subset(individual, x_init_train, y_init_train)
        # Train a fresh model on the selected instances and keep it on the individual.
        model_clone = clone(model)
        model_clone.fit(x_sub, y_sub)
        individual.model = model_clone
        # Cross-validated soft labels on the same subset.
        y_pred_proba = k_fold_cross_validation(
            model=model, X=x_sub, y=y_sub, n_splits=n_splits, method='soft', random_state=random_seed)
        individual.y_sub_and_pred_proba = (y_sub, y_pred_proba)
        individual.gmean, individual.mauc, _ = calculate_gmean_mauc(y_pred_proba, y_sub)
        individual.fitness.values = toolbox.evaluate(individual)

    # ---- bookkeeping --------------------------------------------------------
    # No statistic functions are registered on `stats` in this function; the
    # logged columns are supplied explicitly to logbook.record below.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    logbook = tools.Logbook()
    logbook.header = "gen", "fronts", "ensembles_size", "avg_gmean", "avg_mauc"

    # ---- initial population -------------------------------------------------
    pop = toolbox.population(n=POPSIZE)
    pop = toolbox.balanced_dataset_for_population(pop)  # turn each individual into a balanced subset
    for ind in pop:
        get_fitness_values(ind)
    # Bound up front so the final ensemble is well-defined even if NGEN were 0.
    ensembles_individuals = pop

    # ---- generations --------------------------------------------------------
    for gen in range(1, NGEN + 1):
        # Tournament selection (non-domination rank first, then crowding distance).
        offspring = selTournamentNDCD(pop, POPSIZE, tournsize=3)
        offspring = [toolbox.clone(ind) for ind in offspring]
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])  # one-point crossover
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]  # bit-flip mutation
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]
            del offspring[i].fitness.values, offspring[i + 1].fitness.values

        # Merge parents and offspring, then drop duplicated individuals.
        pop = pop + offspring
        duplicates = toolbox.find_duplicates(pop)
        pop, _ = toolbox.remove_duplicates(pop, duplicates)

        # If deduplication left fewer than POPSIZE individuals, refill with
        # freshly mutated variants of random offspring.
        while len(pop) < POPSIZE:
            add_individual = []
            for _ in range(POPSIZE - len(pop)):
                index = random.randint(0, len(offspring) - 1)  # inclusive on both ends
                # Mutate a clone: offspring[index] may still be a member of `pop`
                # (or already queued in add_individual), so mutating it directly
                # would alter that member and only add an alias of it.
                mutant = toolbox.clone(offspring[index])
                mutant = toolbox.mutate(mutant, MR)[0]
                del mutant.fitness.values
                offspring[index] = mutant  # later picks of this index build on the mutated genome
                add_individual.append(mutant)
            pop = pop + add_individual
            add_duplicates = toolbox.find_duplicates(pop)
            pop, _ = toolbox.remove_duplicates(pop, add_duplicates)

        # Evaluate only the individuals whose fitness was invalidated.
        for ind in pop:
            if not ind.fitness.valid:
                get_fitness_values(ind)

        # Environmental selection, then count the Pareto fronts of the survivors.
        pop = toolbox.select(pop, POPSIZE)
        pareto_fronts = tools.sortNondominated(pop, len(pop))

        ensembles_individuals = pop
        avg_gmean, avg_mauc = ensembles_individuals_gmean_mauc(ensembles_individuals)
        record = stats.compile(pop)
        logbook.record(gen=gen, fronts=len(pareto_fronts), ensembles_size=len(ensembles_individuals),
                       avg_gmean=avg_gmean, avg_mauc=avg_mauc, **record)
        print(logbook.stream)

    # The ensemble consists of the models attached to the final individuals.
    ensembles = [ind.model for ind in ensembles_individuals]
    return pop, stats, ensembles, ensembles_individuals


if __name__ == "__main__":
    # Run the evolutionary search with the module-level seed, MLP prototype and fold count.
    pop, stats, ensembles, ensembles_individuals = main(random_seed=random_seed, model=mlp, n_splits=n_splits)
    print("##############################集成分类器的预测结果：################################")
    # Combine the returned models with vote_ensembles and score them on the held-out test set.
    g_mean, m_auc, recall_per_class = vote_ensembles(ensembles, x_test, y_test, show_result=True)
    print(f"最终的集成分类结果：Recall_Per_Class{recall_per_class}，Gmean：{g_mean}，mAUC：{m_auc}")
    print("训练已结束！")

gen	fronts	ensembles_size	avg_gmean	avg_mauc
1  	8     	40            	0.2437   	0.9104  
2  	7     	40            	0.2649   	0.9119  
3  	7     	40            	0.2754   	0.9124  
4  	6     	40            	0.2948   	0.9133  
5  	8     	40            	0.3098   	0.9139  
6  	7     	40            	0.3224   	0.9144  
7  	7     	40            	0.3232   	0.915   
8  	7     	40            	0.3249   	0.9156  
9  	7     	40            	0.3276   	0.9156  
10 	7     	40            	0.3285   	0.9159  
11 	7     	40            	0.3285   	0.9159  
12 	7     	40            	0.3298   	0.916   
13 	7     	40            	0.329    	0.9162  
14 	7     	40            	0.3316   	0.9159  
15 	7     	40            	0.3325   	0.916   
16 	7     	40            	0.3325   	0.916   
17 	6     	40            	0.342    	0.9162  
18 	6     	40            	0.3425   	0.9164  
19 	6     	40            	0.3427   	0.9164  
20 	6     	40            	0.3424   	0.9165  
21 	6     	40            	0.3421   	0.9166  
22 	7     