### 数据集构建

In [23]:
from sklearn.model_selection import train_test_split
from utils.dataset_utils import get_classes_indexes_counts
import scipy.io as sio  # 从.mat文件中读取数据集
import numpy as np

################################################################加载数据集################################################
# 数据集
mat_data = sio.loadmat('../../data/dataset/LedDisplay.mat')
# 提取变量
dataset_x = mat_data['X']
dataset_y = mat_data['Y'][:, 0]  # mat_data['Y']得到的形状为[n,1]，通过[:,0]，得到形状[n,]
# 显示数据集分布
print("特征数据:", dataset_x.shape)
print("label:", dataset_y.shape)
# 统计每个类别的个数，dataset_y.max()+1是类别的个数
classes, counts = get_classes_indexes_counts(dataset_y)
print("每种类别的数量：", counts)

特征数据: (1000, 24)
label: (1000,)
每种类别的数量： [103  89 116 114 116  86 107  97  83  89]


###  划分数据集

In [24]:
x_train, x_test, y_train, y_test = train_test_split(dataset_x, dataset_y, test_size=0.3, random_state=42)
# 显示数据集分布
print("特征数据:", x_train.shape)
print("label:", y_train.shape)
# 统计每个类别的个数
classes_train, counts_train = get_classes_indexes_counts(y_train)
print("每种类别的数量：", counts_train)

特征数据: (700, 24)
label: (700,)
每种类别的数量： [67 68 79 79 81 63 71 63 63 66]


### 训练集的构建
1.首先找出类别数量最小的类
2.取最小类别数量的90%，作为平衡数据集的各类别原始数量
3.在每个类别中随机抽取实例构成初始数据集

In [25]:
# 确定每个类别的数量
num_instances = int(counts_train.min() * 0.9)  # 向下取整
print("最小数量:", num_instances)

# 在每个类别中随机的选择该数量的实例的索引
balanced_classes = np.array([])
for indexes in classes_train:
    random_selecte_indices = np.random.choice(indexes, size=num_instances, replace=False)
    balanced_classes = np.hstack((balanced_classes, random_selecte_indices))
balanced_classes = np.sort(balanced_classes).astype(int)

# 得到平衡的数据集
balanced_dataset_x = []
balanced_dataset_y = np.array([])
for index in balanced_classes:
    balanced_dataset_x.append(x_train[index])
    balanced_dataset_y = np.hstack((balanced_dataset_y, y_train[index]))
balanced_dataset_x = np.array(balanced_dataset_x)
balanced_dataset_y = np.array(balanced_dataset_y).astype(int)

# 显示数据集分布
print("平衡的数据集的特征数据:", balanced_dataset_x.shape)
print("label:", balanced_dataset_y.shape)

# 统计每个类别的个数
classes_balanced_dataset, counts_balanced_dataset = get_classes_indexes_counts(balanced_dataset_y)
print("平衡的数据集中每种类别的数量：", counts_balanced_dataset)

最小数量: 56
平衡的数据集的特征数据: (560, 24)
label: (560,)
平衡的数据集中每种类别的数量： [56 56 56 56 56 56 56 56 56 56]


### 单独训练

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# 数据标准化
scaler = StandardScaler()
x_train_single = scaler.fit_transform(balanced_dataset_x)
x_test_single = scaler.transform(x_test)
# 构建并训练MLP模型
mlp = MLPClassifier(hidden_layer_sizes=(10, 20), max_iter=1000, random_state=42)
mlp.fit(x_train_single, balanced_dataset_y)

# 预测和评估模型
y_pred = mlp.predict(x_test_single)

# 输出结果
print("准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:\n", classification_report(y_test, y_pred))

### NSGAII算法

### 算法的实现

In [85]:
from sklearn.metrics import confusion_matrix


#####################################################
def get_indices(individual):
    '''
    :param individual: individual（用实值进行编码）
    :return: 被选择实例的索引
    '''
    individual = np.round(individual)  # 数据范围在0-1之间，转化成int的同时会舍去小数部分，从而将个体映射到0-1编码
    indices = np.where(individual == 1)  # 1代表选择该实例，返回值是tuple，tuple[0]取元组中的第一个元素
    return indices[0]


#######################################################

def get_subset(individual):
    indices = get_indices(individual)
    x_sub = balanced_dataset_x[indices, :]
    y_sub = balanced_dataset_y[indices]
    return x_sub, y_sub


###########################################################

def fitness_function(x_sub, y_sub, ensembles, index):
    f1 = ensembles[index].fit(x_sub)
    # 使用测试数据进行预测
    y_pred = ensembles[index].predict(x_sub)

    # 计算混淆矩阵
    tn, fp, fn, tp = confusion_matrix(y_sub, y_pred).ravel()
    f1 = 1.0 * tp / (tp + fp)
    sum = 0.0
    for i in range(len(ensembles)):
        if i != index:
            # 计算两个数组中索引对应的元素值不相等的个数
            y_pred_i = ensembles[i].predict(x_sub)
            tn_i, fp_i, fn_i, tp_i = confusion_matrix(y_sub, y_pred_i).ravel()
            count = sum(1 for a, b in zip(y_pred, y_pred_i) if a != b)
            sum = sum + count / (fp + fn + fp_i + fn_i)
    f2 = sum / len(ensembles)
    return f1, f2


### 多目标评价

In [86]:
import array
import random
import numpy
from deap import base
from deap import benchmarks
from deap.benchmarks.tools import diversity, convergence, hypervolume
from deap import creator
from deap import tools

creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMin)
toolbox = base.Toolbox()

# Problem definition

BOUND_LOW, BOUND_UP = 0.0, 1.0

NDIM = 30


def uniform(low, up, size=None):
    try:
        return [random.uniform(a, b) for a, b in zip(low, up)]
    except TypeError:
        return [random.uniform(a, b) for a, b in zip([low] * size, [up] * size)]


toolbox.register("attr_float", uniform, BOUND_LOW, BOUND_UP, NDIM)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.attr_float)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", benchmarks.zdt1)
toolbox.register("evaluate", fitness_function)
# toolbox.register("mate", tools.cxSimulatedBinaryBounded, low=BOUND_LOW, up=BOUND_UP, eta=20.0)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutPolynomialBounded, low=BOUND_LOW, up=BOUND_UP, eta=20.0, indpb=1.0 / NDIM)
# toolbox.register("select", tools.selNSGA2)
toolbox.register("select", tools.selTournament, tournsize=3)


def main(seed=None):
    random.seed(seed)

    NGEN = 250
    MU = 30
    CXPB = 0.9

    ####################################迭代过程的记录###########################
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    # stats.register("avg", numpy.mean, axis=0)
    # stats.register("std", numpy.std, axis=0)
    stats.register("min", numpy.min, axis=0)
    stats.register("max", numpy.max, axis=0)
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "min", "max"

    ####################################种群的初始化###########################
    pop = toolbox.population(n=MU)

    ####################################计算初始种群的适应度###########################
    ensembles = []  # 当前每个个体对应的mlp模型
    save_ensembles = []  # 存储每个个体对应的mlp模型
    pop_x_sub = [] # 当前每个个体的实例选择的特征数据
    pop_y_sub = [] # 当前每个个体对应的实例选择的lable
    for i in range(len(pop)):
        mlp = MLPClassifier(hidden_layer_sizes=(10, 20), max_iter=1000, random_state=42)
        x_sub, y_sub = get_subset(pop[i])
        mlp.fit(x_sub, y_sub)
        ensembles.append(mlp)
        pop_x_sub.append(x_sub)
        pop_y_sub.append(y_sub)
    save_ensembles = ensembles
    for i in range(len(pop)):
        pop.fitness.values[i] = toolbox.evaluate(pop_x_sub[i], pop_y_sub[i], ensembles)
    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)
    ####################################种群的迭代###########################
    for gen in range(1, NGEN):
        # 选择
        offspring = toolbox.select(pop, len(pop))
        offspring = [toolbox.clone(ind) for ind in offspring]

        # 交叉
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])
            # 变异
            offspring[i] = toolbox.mutate(offspring[i])[0]
            offspring[i + 1] = toolbox.mutate(offspring[i + 1])[0]
            del offspring[i].fitness.values, offspring[i + 1].fitness.values

        # Evaluate the individuals with an invalid fitness
        fitnesses = toolbox.map(toolbox.evaluate, offspring)
        for ind, fit in zip(offspring, fitnesses):
            ind.fitness.values = fit

        for i in range(len(pop)):
            mlp = MLPClassifier(hidden_layer_sizes=(10, 20), max_iter=1000, random_state=42)
            x_sub, y_sub = get_subset(pop[i])
            mlp.fit(x_sub, y_sub)
            ensembles.append(mlp)
            pop_x_sub.append(x_sub)
            pop_y_sub.append(y_sub)
        for i in range(len(pop)):
            pop.fitness.values[i] = toolbox.evaluate(pop_x_sub[i], pop_y_sub[i], ensembles)

        # Select the next generation population
        pop = toolbox.select(pop + offspring, MU)
        record = stats.compile(pop)
        logbook.record(gen=gen, evals=len(pop), **record)
        print(logbook.stream)

    print("Final population hypervolume is %f" % hypervolume(pop, [11.0, 11.0]))

    return pop, logbook


if __name__ == "__main__":
    pop, stats = main()


TypeError: fitness_function() missing 3 required positional arguments: 'y_sub', 'ensembles', and 'index'

In [73]:
def swap(a, b):
    return b, a


list1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
print(list1)
for ind1, ind2 in zip(list1[::2], list1[1::2]):
    ind1, ind2 = ind1 + 1, ind2 + 2
print(list1)            

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
