# E-MOSAIC

引用自[E. R. Q. Fernandes, A. C. P. L. F. de Carvalho and X. Yao, "Ensemble of Classifiers Based on Multiobjective Genetic Sampling for Imbalanced Data," in IEEE Transactions on Knowledge and Data Engineering, vol. 32, no. 6, pp. 1104-1115, 1 June 2020, doi: 10.1109/TKDE.2019.2898861.]


## 数据集的预处理 

In [1]:
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from utils.dataset_utils import get_classes_indexes_counts
import scipy.io as sio  # 从.mat文件中读取数据集
from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

print("#########################加载数据集#########################")
'''
id = 12 balance_scale  
'''
# Dataset: fetched from the UCI ML repository by id (the note above names id 12 as balance_scale).
uci_dataset = fetch_ucirepo(id=12)
#mat_data = sio.loadmat('../../data/dataset/USPS.mat') 
# Pull the feature table and the target table out of the fetched dataset
features = uci_dataset.data.features  # feature data
targets = uci_dataset.data.targets  # labels
# Convert both tables to NumPy arrays; [:, 0] keeps the first target column as a 1-D vector

dataset_x = features.to_numpy()
dataset_y = targets.to_numpy()[:, 0]

# dataset_x = mat_data['X']
# dataset_y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] flattens it to [n,]

# Show the shapes of the loaded data
print("特征数据:", dataset_x.shape)
print("label:", dataset_y.shape)

# One-hot encode the target: each class becomes a 0/1 column (1 = instance belongs to that class)
encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(dataset_y.reshape(-1, 1))
print("label:", y_onehot.shape)
# Count the instances of each class.
# NOTE(review): get_classes_indexes_counts is a project helper; presumably returns (classes, per-class counts) — confirm.
classes, counts = get_classes_indexes_counts(
    np.argmax(y_onehot, axis=1))  # argmax over the one-hot columns recovers integer labels 0, 1, 2, ...
print("每种类别的数量：", counts)

# ---------------------------------- train/test split ----------------------------------
print("#########################划分数据集#########################")

# 70/30 split with a fixed seed; no stratification is requested here
x_train, x_test, y_train, y_test = train_test_split(dataset_x, y_onehot, test_size=0.3, random_state=42)

# Standardize the feature data: the scaler is fitted on the training split only,
# then the same transform is applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Show the shapes of the training split
print("特征数据:", x_train.shape)
print("label:", y_train.shape)

# Per-class counts for each split; np.argmax(..., axis=1) converts one-hot rows back to single class labels
classes_train, counts_train = get_classes_indexes_counts(np.argmax(y_train, axis=1))
print("训练集每种类别的数量：", counts_train)

classes_test, counts_test = get_classes_indexes_counts(np.argmax(y_test, axis=1))
print("测试集每种类别的数量：", counts_test)
# Number of training instances; later used as the length of each individual's encoding
num_instances = y_train.shape[0]  
print("训练集实例数量:", num_instances)

#########################加载数据集#########################
特征数据: (625, 4)
label: (625,)
label: (625, 3)
每种类别的数量： [ 49 288 288]
#########################划分数据集#########################
特征数据: (437, 4)
label: (437, 3)
训练集每种类别的数量： [ 31 208 198]
测试集每种类别的数量： [18 80 90]
训练集实例数量: 437


## 评价函数
（G-mean,mAUC两个目标）

In [2]:
from scipy.stats import gmean
from sklearn.metrics import precision_score, roc_auc_score, accuracy_score
from scipy.stats import mode


##########################由个体得到选择的实例子集的索引###########################
def get_indices(individual):
    """Return the positions of the instances an individual selects.

    :param individual: sequence of gene values (one per training instance)
    :return: 1-D array of positions whose gene rounds to exactly 1

    Genes are rounded with ``np.round`` (nearest value, not truncation) before
    the comparison, so only positions whose rounded gene equals 1 are selected.
    """
    rounded_genes = np.round(individual)
    selected_mask = rounded_genes == 1
    # nonzero() yields one index array per dimension; the first holds the positions
    return np.nonzero(selected_mask)[0]


###########################获取实例子集############################
def get_subset(individual):
    """Slice the training data down to the instances an individual selects.

    :param individual: encoded individual, decoded via ``get_indices``
    :return: ``(x_sub, y_sub)`` — the selected rows of the module-level
             ``X_train_scaled`` and ``y_train`` arrays
    """
    chosen_rows = get_indices(individual)
    return X_train_scaled[chosen_rows, :], y_train[chosen_rows, :]


##########################适应度函数（两个目标：各类别PPV的几何平均值 与 mAUC）#################################
def fitness_function(individual):
    """Score an individual's trained classifier on the two optimisation objectives.

    The classifier stored in ``individual.mlp`` is evaluated on the module-level
    ``X_test_scaled`` / ``y_test`` arrays (not on the training data).

    :param individual: individual whose ``mlp`` attribute holds a fitted classifier
    :return: ``(g_mean, m_auc)`` rounded to 4 decimals, where ``g_mean`` is the
             geometric mean of the per-class precision (PPV) values and
             ``m_auc`` is the macro-averaged ROC AUC
    """
    classifier = individual.mlp
    predicted_indicator = classifier.predict(X_test_scaled)  # used for the per-class PPV
    predicted_scores = classifier.predict_proba(X_test_scaled)  # used for the AUC

    # Convert one-hot encoded rows back to single class labels
    y_test_labels = np.argmax(y_test, axis=1)
    y_pred_labels = np.argmax(predicted_indicator, axis=1)

    # Objective 1: geometric mean of the per-class precision values.
    # A single class with precision 0 drives the whole objective to 0.
    class_precisions = precision_score(y_test_labels, y_pred_labels, average=None)
    geometric_mean = gmean(class_precisions)

    # Objective 2: macro-averaged ROC AUC.
    # NOTE(review): y_test is passed as a one-hot matrix; scikit-learn may treat
    # that as a multilabel target, in which case multi_class="ovo" would not take
    # effect — confirm against the roc_auc_score documentation.
    auc_ovo_macro = roc_auc_score(y_test, predicted_scores, multi_class="ovo", average="macro")
    return round(geometric_mean, 4), round(auc_ovo_macro, 4)


def vote_ensembles(save_ensembles):
    """Combine the classifiers by majority vote and print test-set metrics.

    Each classifier predicts the module-level ``X_test_scaled``; the per-sample
    label that occurs most often across classifiers is the ensemble prediction.
    Accuracy, the classification report and the confusion matrix against
    ``y_test`` are printed. Nothing is returned.

    :param save_ensembles: iterable of fitted classifiers exposing ``predict``
    """
    y_test_labels = np.argmax(y_test, axis=1)

    # One row of integer labels per classifier (argmax over its indicator output)
    member_votes = [
        np.argmax(member.predict(X_test_scaled), axis=1)
        for member in save_ensembles
    ]

    # Column-wise vote: the most frequent label per test sample
    vote_result = mode(member_votes, axis=0, keepdims=False)
    final_result = vote_result.mode.flatten()

    # Accuracy
    accuracy = accuracy_score(y_test_labels, final_result)
    print(f'Accuracy: {accuracy:.2f}')

    # Classification report
    print("Classification Report:")
    report_text = classification_report(y_test_labels, final_result)
    print(report_text)

    # Confusion matrix
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test_labels, final_result)
    print(matrix)


## NSGA-II

In [3]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from instance_selection.nsga_2.genetic_operator import selNSGA2, mutate_binary_inversion, selTournamentDCD
import warnings

warnings.filterwarnings("ignore")  # 忽略警告
from sklearn.neural_network import MLPClassifier

import array
import random
import numpy
from deap import base
from deap import creator
from deap import tools

# 最大化评价目标
# Both objectives are maximised (two positive weights)
creator.create("FitnessMaxAndMax", base.Fitness, weights=(1.0, 1.0))
'''
fitness:适应度：Gmean和mAUC
pfc：每个分类器的成对故障信用，用于评估分类器集合的多样性
'''
# Individual type: an array of doubles carrying the two-objective fitness plus two extra
# attributes. Per the note above, `fitness` holds G-mean and mAUC and `pfc` is the
# per-classifier pairwise failure credit used to assess ensemble diversity;
# `mlp` is where the classifier trained for the individual gets attached.
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMaxAndMax, pfc=None, mlp=None)
toolbox = base.Toolbox()

# Encoding length = number of training instances (NDIM itself is not referenced in the visible cells)
NDIM = num_instances
# Binary encoding: each gene is drawn as 0 or 1 (1 = instance selected)
toolbox.register("attr_binary", random.randint, 0, 1)  # 0/1 genes
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_binary, n=num_instances)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", fitness_function)

# One-point crossover
toolbox.register("mate", tools.cxOnePoint)
# Binary mutation (project-local operator)
toolbox.register("mutate", mutate_binary_inversion)
# Project-local NSGA-II selection; the test split is bound in as keyword arguments
toolbox.register("select", selNSGA2, x_test=X_test_scaled, y_test=y_test)

# NOTE(review): init_mlp is not referenced in the visible cells — confirm whether it is still needed
init_mlp = MLPClassifier(hidden_layer_sizes=(15,), max_iter=500, random_state=42)


## 种群的迭代

In [5]:

def _fit_and_score(individuals, max_iter):
    """Train one MLP per individual on its selected subset, then assign fitness."""
    # Phase 1: fit a fresh classifier on each individual's instance subset
    for ind in individuals:
        mlp = MLPClassifier(hidden_layer_sizes=(20,), max_iter=max_iter, random_state=42)
        x_sub, y_sub = get_subset(ind)
        mlp.fit(x_sub, y_sub)
        ind.mlp = mlp
    # Phase 2: derive the fitness from the classifier attached above
    for ind in individuals:
        ind.fitness.values = toolbox.evaluate(ind)


def main(seed=None):
    """Run the multi-objective genetic instance-selection loop.

    :param seed: value passed to ``random.seed`` (only Python's ``random``
                 module is seeded here)
    :return: ``(pop, logbook, save_ensembles)`` — the final population, the
             per-generation min/max fitness log, and the list of classifiers
             attached to the final population (one per individual)

    NOTE(review): the recorded run of this cell stopped with
    ``ZeroDivisionError`` after generation 15; no traceback is available here,
    so the failing call is not identified — investigate the selection operators.
    """
    random.seed(seed)

    NGEN = 30  # generation counter upper bound (generation 0 is the initial population)
    MU = 40  # population size
    CXPB = 1.0  # crossover probability
    MR = 0.2  # mutation rate handed to the mutation operator

    # ------------------------- bookkeeping for the run -------------------------
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("min", numpy.min, axis=0)
    stats.register("max", numpy.max, axis=0)
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "min", "max"

    # ------------------------- initial population -------------------------
    pop = toolbox.population(n=MU)

    # NOTE(review): the initial population trains with max_iter=800 while
    # offspring train with max_iter=1000; kept as-is to preserve results —
    # confirm whether the difference is intentional.
    _fit_and_score(pop, max_iter=800)

    # Rank the initial population with the registered selection operator
    # (selecting len(pop) keeps every individual)
    pop = toolbox.select(pop, len(pop))

    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)

    # ------------------------- generational loop -------------------------
    for gen in range(1, NGEN):
        # Parent selection, then clone so variation never touches the parents
        offspring = selTournamentDCD(pop, len(pop))
        offspring = [toolbox.clone(ind) for ind in offspring]

        # Variation is applied to consecutive pairs
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])
            # The mutation operator's result is indexed with [0] to get the individual
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]
            # Invalidate fitness copied over from the parents
            del offspring[i].fitness.values, offspring[i + 1].fitness.values

        # Retrain and re-evaluate every offspring
        _fit_and_score(offspring, max_iter=1000)

        # Select the next generation from parents + offspring
        pop = toolbox.select(pop + offspring, MU)
        record = stats.compile(pop)
        logbook.record(gen=gen, evals=len(pop), **record)
        print(logbook.stream)

    # The classifiers of the final population form the ensemble
    save_ensembles = [ind.mlp for ind in pop]
    return pop, logbook, save_ensembles


if __name__ == "__main__":
    # main() returns (final population, logbook, trained classifiers);
    # note that the name `stats` here receives the logbook
    pop, stats, ensembles = main()

    print("##############################集成分类器的预测结果：################################")
    # Majority-vote the returned classifiers and print the test-set metrics
    vote_ensembles(ensembles)

gen	evals	min            	max            
0  	40   	[0.7815 0.8758]	[0.9846 0.9632]
1  	40   	[0.8336 0.8758]	[0.9846 0.9742]
2  	40   	[0.833  0.8758]	[0.9846 0.9742]
3  	40   	[0.8219 0.9271]	[0.9963 0.9742]
4  	40   	[0.8107 0.9271]	[1.    0.975]  
5  	40   	[0.8695 0.9441]	[1.    0.975]  
6  	40   	[0.8695 0.9569]	[1.    0.975]  
7  	40   	[0.8566 0.9569]	[1.    0.975]  
8  	40   	[0.8566 0.9569]	[1.     0.9764]
9  	40   	[0.8566 0.9716]	[1.     0.9764]
10 	40   	[0.8445 0.9716]	[1.     0.9765]
11 	40   	[0.8445 0.9716]	[1.     0.9765]
12 	40   	[0.8445 0.9716]	[1.     0.9779]
13 	40   	[0.8445 0.9716]	[1.     0.9779]
14 	40   	[0.8445 0.9719]	[1.     0.9779]
15 	40   	[0.8445 0.9719]	[1.     0.9784]


ZeroDivisionError: float division by zero