### 测试

In [1]:
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection.operator.init_toolbox import init_toolbox
from instance_selection.operator.metrics import calculate_accuracy, calculate_gmean_mauc
from instance_selection.operator.genetic_operator import selTournamentNDCD
from instance_selection.operator.ensemble import vote_result_ensembles, calculate_average_gmean_mauc
from sklearn.preprocessing import StandardScaler
import numpy as np
from utils.dataset_utils import get_distribution
import scipy.io as sio  # 从.mat文件中读取数据集
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
import warnings
import random
from deap import tools

from utils.dataset_utils import get_subset, k_fold_cross_validation

# Notebook set-up cell: load the configured dataset, make a 70/30 train/test
# split, standardise the features, print the class distributions and run a
# baseline cross-validation of the MLP on the full training split.
warnings.filterwarnings("ignore")  # silence all warnings

DATASET = Satellite  # dataset configuration dict (file name plus the hyper-parameters indexed below)
datasetname = DATASET['DATASETNAME'].split('.')[0]  # text before the first '.' of the file name

# Load and split the dataset
mat_data = sio.loadmat('../../data/dataset/' + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] flattens it to shape [n]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=RANDOM_SEED)  # train/test split
scaler = StandardScaler()  # feature standardisation
x_train = scaler.fit_transform(x_train)  # statistics are fitted on the training split only
x_test = scaler.transform(x_test)  # the test split reuses the training statistics

unique_elements_all, classes_all, counts_all = get_distribution(y)  # distribution of the full dataset
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # distribution of the training split
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # distribution of the test split
print(datasetname + f' distribution: {counts_all}')
print(f'trainset distribution: {counts_train}')
print(f'testset distribution: {counts_test}')

# Inverse-frequency class weights, normalised so that they sum to 1
weights_train = (1 / counts_train.astype(float)) / np.sum(1 / counts_train.astype(float))
# Base classifier; later code clones it rather than fitting this instance directly
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])
y_train_pred_proba = k_fold_cross_validation(model=model, X=x_train, y=y_train, n_splits=N_SPLITS, method='soft',
                                             random_state=RANDOM_SEED)  # cross-validated soft labels
# Hard predictions: column index of the largest probability
# (presumably columns follow the class order - TODO confirm against k_fold_cross_validation)
y_train_pred = np.argmax(y_train_pred_proba, axis=1)
Acc1, Acc2, Acc3 = calculate_accuracy(y_train_pred, y_train, weights_train)

Satellite distribution: [1533  703 1358  626  707 1508]
trainset distribution: [1083  517  942  425  488 1049]
testset distribution: [450 186 416 201 219 459]


In [None]:

toolbox = init_toolbox(y_train)  # initialise the toolbox from the training labels


# Evaluate one individual: cross-validate on the subset it selects and store
# the resulting metrics and objective values on the individual itself.
def get_fitness_values(individual):
    """Compute and attach the fitness of ``individual`` in place.

    The instances selected by ``individual`` are extracted from the
    module-level ``x_train`` / ``y_train`` and a fresh clone of the
    module-level ``model`` is cross-validated on them. Nothing is returned;
    the following attributes are set on ``individual``:

    * ``y_sub_and_pred_proba`` - tuple of (subset labels, cross-validated
      predicted probabilities);
    * ``gmean`` and ``mauc`` - first two values returned by
      ``calculate_gmean_mauc``;
    * ``fitness.values`` - result of ``toolbox.evaluate(individual)``
      (presumably derived from the attributes above - verify in
      ``init_toolbox``).

    Note that no fitted model is stored by this function.
    """
    subset_x, subset_y = get_subset(individual, x_train, y_train)

    # Soft labels obtained by cross-validation on the selected subset only.
    proba = k_fold_cross_validation(
        model=clone(model),
        X=subset_x,
        y=subset_y,
        n_splits=N_SPLITS,
        method='soft',
        random_state=RANDOM_SEED,
    )
    individual.y_sub_and_pred_proba = (subset_y, proba)

    gmean, mauc, _ = calculate_gmean_mauc(proba, subset_y)
    individual.gmean = gmean
    individual.mauc = mauc

    # Objective values are assigned last, after the attributes above exist.
    individual.fitness.values = toolbox.evaluate(individual)


def main(model, balanced_method='balanced'):
    """Run one evolutionary instance-selection pass and build the final ensemble.

    The population is initialised, evolved for ``NGEN`` generations
    (tournament selection, crossover, mutation, de-duplication, environmental
    selection) and every individual of the final population is then turned
    into one fitted classifier.

    Args:
        model: Unfitted estimator. A fresh ``clone`` of it is fitted on the
            subset selected by each final individual. NOTE: fitness
            evaluation goes through ``get_fitness_values``, which uses the
            module-level ``model`` rather than this argument.
        balanced_method: Forwarded unchanged to ``toolbox.init_population``.

    Returns:
        A ``(stats, ensembles)`` tuple: the ``tools.Statistics`` object and the
        list of fitted models, one per individual of the final population.
    """
    # ---- per-generation bookkeeping ----
    # No statistic functions are registered on `stats` here; its compiled
    # record is simply merged into each logbook entry.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    logbook = tools.Logbook()
    logbook.header = "gen", "fronts", "ensembles_size", "avg_gmean", "avg_mauc"

    # ---- population initialisation ----
    pop = toolbox.population(n=POPSIZE)  # individuals start with an all-zero encoding
    pop = toolbox.init_population(pop, balanced_method=balanced_method)
    for ind in pop:
        get_fitness_values(ind)

    # Bound before the loop so the ensemble step below still works when
    # NGEN < 1 (previously this name only existed after the first generation).
    ensembles_individuals = pop

    # ---- evolution ----
    for gen in range(1, NGEN + 1):
        # Tournament selection: non-domination rank first, crowding distance second
        offspring = selTournamentNDCD(pop, POPSIZE, tournsize=3)
        offspring = [toolbox.clone(ind) for ind in offspring]
        for i in range(0, len(offspring) - 1, 2):
            if random.random() <= CXPB:
                offspring[i], offspring[i + 1] = toolbox.mate(offspring[i], offspring[i + 1])  # one-point crossover
            offspring[i] = toolbox.mutate(offspring[i], MR)[0]  # bit-flip mutation
            offspring[i + 1] = toolbox.mutate(offspring[i + 1], MR)[0]  # bit-flip mutation
            # Both children changed, so their cached fitness is no longer valid
            del offspring[i].fitness.values, offspring[i + 1].fitness.values

        # ---- merge parents and offspring, then drop duplicates ----
        pop = pop + offspring
        pop, _ = toolbox.remove_duplicates(pop)
        # Refill with freshly mutated offspring until POPSIZE individuals remain.
        # NOTE(review): offspring[index] may already be a member of `pop`; if
        # toolbox.mutate works in place, that member is altered and the same
        # object is appended again - consider mutating a clone. Confirm the
        # semantics of toolbox.mutate / remove_duplicates before changing this.
        while len(pop) < POPSIZE:
            add_individual = []
            num_add = POPSIZE - len(pop)
            for _ in range(num_add):
                index = random.randint(0, len(offspring) - 1)  # random position in offspring (both ends inclusive)
                offspring[index] = toolbox.mutate(offspring[index], MR)[0]
                del offspring[index].fitness.values
                add_individual.append(offspring[index])
            pop = pop + add_individual
            pop, _ = toolbox.remove_duplicates(pop)

        # Only individuals whose fitness was invalidated need re-evaluation
        for ind in pop:
            if not ind.fitness.valid:
                get_fitness_values(ind)

        # ---- environmental selection and Pareto fronts ----
        pop = toolbox.select(pop, POPSIZE)
        pareto_fronts = tools.sortNondominated(pop, len(pop))

        # The whole selected population forms the ensemble; restricting it to
        # the first front (pareto_fronts[0]) is the alternative that was tried.
        ensembles_individuals = pop
        record = stats.compile(pop)
        avg_gmean, avg_mauc = calculate_average_gmean_mauc(ensembles_individuals)
        logbook.record(gen=gen, fronts=len(pareto_fronts), ensembles_size=len(ensembles_individuals),
                       avg_gmean=avg_gmean, avg_mauc=avg_mauc, **record)
        print(logbook.stream)

    # ---- build the final ensemble ----
    ensembles = []
    for ind in ensembles_individuals:
        x_sub, y_sub = get_subset(ind, x_train, y_train)
        # Train a fresh copy of the model on the instances this individual selects
        fitted = clone(model)
        fitted.fit(x_sub, y_sub)
        ind.model = fitted
        ensembles.append(fitted)
    return stats, ensembles


if __name__ == "__main__":
    # Repeat the full evolutionary run several times and score the resulting
    # ensemble of each run on the held-out test split.
    run_num = 10
    # One [g_mean, m_auc] pair is appended per completed run, so the list stays
    # rectangular (and can still be converted to a 2-D array for plotting)
    # even if execution is interrupted part-way through.
    ensembles_result_balanced = []
    print("###############################平衡#######################################")
    for i in range(run_num):
        print(f"第{i + 1}次执行")
        stats, ensembles = main(model=model)
        vote_pred_prob = vote_result_ensembles(ensembles, x_test)  # soft labels by default
        g_mean, m_auc, recall_per_class = calculate_gmean_mauc(vote_pred_prob, y_test)
        ensembles_result_balanced.append([g_mean, m_auc])
        print(f"最终的集成分类结果：Recall_Per_Class{recall_per_class}，Gmean：{g_mean}，mAUC：{m_auc}")
    print("训练已结束！")
    # Column-wise mean over the runs: [mean g_mean, mean m_auc]
    ensembles_result_mean_balanced = np.mean(ensembles_result_balanced, axis=0)

###############################平衡#######################################
第1次执行
gen	fronts	ensembles_size	avg_gmean	avg_mauc
1  	11    	40            	0.850139 	0.978042
2  	10    	40            	0.852467 	0.978795
3  	9     	40            	0.853291 	0.978892
4  	10    	40            	0.854102 	0.979157
5  	10    	40            	0.854264 	0.979268
6  	8     	40            	0.854928 	0.979404
7  	7     	40            	0.855058 	0.97971 
8  	7     	40            	0.855793 	0.979799
9  	7     	40            	0.85593  	0.98004 
10 	7     	40            	0.855954 	0.980227
11 	8     	40            	0.856768 	0.980213
12 	8     	40            	0.857277 	0.980368
13 	7     	40            	0.857409 	0.980493
14 	7     	40            	0.857751 	0.9805  
15 	7     	40            	0.857644 	0.980821


### 绘制ensemble_gmean_mauc_acc2折线图

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

# Per-run test metrics of the final ensemble. ensembles_result_balanced holds
# one [g_mean, m_auc] pair per run, so after transposing row 0 is Gmean and
# row 1 is mAUC, with one column per run.
data1 = np.array(ensembles_result_balanced).T

# Define markers and colors
markers = ['o', '^']
datasets = [data1]
colors1 = plt.cm.viridis(np.linspace(0, 1, len(data1)))
metric_names = ['Gmean', 'mAUC']  # row order of data1

plt.figure(figsize=(12, 8))

# One curve per metric. The loop variables are deliberately not called x / y
# so that the dataset arrays bound to those names in the first cell survive.
for i, (curve, color) in enumerate(zip(data1, colors1)):
    runs = np.arange(1, len(curve) + 1)
    avg = np.mean(curve)
    metric = metric_names[i] if i < len(metric_names) else f'Curve {i + 1}'
    plt.plot(runs, curve, marker=markers[0], label=f'Balanced - {metric} (Avg: {avg:.4f})', color=color)
    # Annotate every point with its value
    for xi, yi in zip(runs, curve):
        plt.text(xi, yi, f'{yi}', fontsize=9, ha='center', va='bottom', color=color)

# Plot settings
plt.title('Balanced Ensemble Gmean and mAUC per Run with Point Values and Averages', fontsize=14)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Value', fontsize=12)
plt.legend(fontsize=10)
plt.grid(alpha=0.5)
plt.tight_layout()

# NOTE(review): hard-coded, machine-specific Windows output directory.
save_path = 'C:\\Users\\zsc\\Desktop\\evolution computation\\meeting\\2024.12.31\\e-mosaic\\' + datasetname.split('.')[
    0] + '\\'
# Create the output folder if it does not exist yet
folder = os.path.dirname(save_path)
os.makedirs(folder, exist_ok=True)
# Save the figure
plt.savefig(save_path + datasetname.split('.')[0] + 'ensemble_gmean_mauc_acc2.jpg', dpi=300, bbox_inches='tight')

### 绘制init_gmean_mauc折线图

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): geam_mauc_random / geam_mauc_balanced are not defined in the
# cells visible here; they must be produced by another cell before this runs.
# Each is transposed so that every row becomes one curve.
data1 = np.array(geam_mauc_random).T
data2 = np.array(geam_mauc_balanced).T

# Define markers and colors
markers = ['o', '^']  # 'o' for data1, '^' for data2
datasets = [data1, data2]
colors1 = plt.cm.viridis(np.linspace(0, 1, len(data1)))
colors2 = plt.cm.viridis(np.linspace(0, 1, len(data2)))
series_names = ['Random', 'Balanced']  # legend prefix for data1, data2

plt.figure(figsize=(12, 8))

# Plot both datasets with one loop. The loop variables are deliberately not
# called x / y so that the dataset arrays bound to those names in the first
# cell survive.
for data, palette, marker, series in zip(datasets, (colors1, colors2), markers, series_names):
    for i, (curve, color) in enumerate(zip(data, palette)):
        positions = np.arange(1, len(curve) + 1)
        avg = np.mean(curve)
        plt.plot(positions, curve, marker=marker, label=f'{series} - Curve {i + 1} (Avg: {avg:.4f})', color=color)
        # Annotate every point with its value
        for xi, yi in zip(positions, curve):
            plt.text(xi, yi, f'{yi}', fontsize=9, ha='center', va='bottom', color=color)

# Plot settings
plt.title('Curves from Two 2D Numpy Arrays with Point Values and Averages', fontsize=14)
plt.xlabel('Number of iterations', fontsize=12)
plt.ylabel('Value', fontsize=12)
plt.legend(fontsize=10)
plt.grid(alpha=0.5)
plt.tight_layout()

# NOTE(review): hard-coded, machine-specific Windows output directory.
save_path = 'C:\\Users\\zsc\\Desktop\\evolution computation\\meeting\\2024.12.31\\e-mosaic\\' + datasetname.split('.')[
    0] + '\\'
# Create the output folder if it does not exist yet
folder = os.path.dirname(save_path)
os.makedirs(folder, exist_ok=True)
# Save the figure
plt.savefig(save_path + datasetname.split('.')[0] + 'init_gmean_mauc.jpg', dpi=300, bbox_inches='tight')


### 绘制init_acc折线图

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): acc_random / acc_balanced are not defined in the cells visible
# here; they must be produced by another cell before this runs.
# Each is transposed so that every row becomes one curve.
data1 = np.array(acc_random).T
data2 = np.array(acc_balanced).T

# Define markers and colors
markers = ['o', '^']  # 'o' for data1, '^' for data2
datasets = [data1, data2]
colors1 = plt.cm.viridis(np.linspace(0, 1, len(data1)))
colors2 = plt.cm.viridis(np.linspace(0, 1, len(data2)))
series_names = ['Random', 'Balanced']  # legend prefix for data1, data2

plt.figure(figsize=(12, 8))

# Plot both datasets with one loop. The loop variables are deliberately not
# called x / y so that the dataset arrays bound to those names in the first
# cell survive.
for data, palette, marker, series in zip(datasets, (colors1, colors2), markers, series_names):
    for i, (curve, color) in enumerate(zip(data, palette)):
        positions = np.arange(1, len(curve) + 1)
        avg = np.mean(curve)
        plt.plot(positions, curve, marker=marker, label=f'{series} - Curve {i + 1} (Avg: {avg:.4f})', color=color)
        # Annotate every point with its value
        for xi, yi in zip(positions, curve):
            plt.text(xi, yi, f'{yi}', fontsize=9, ha='center', va='bottom', color=color)

# Plot settings
plt.title('Curves from Two 2D Numpy Arrays with Point Values and Averages', fontsize=14)
plt.xlabel('Number of iterations', fontsize=12)
plt.ylabel('Value', fontsize=12)
plt.legend(fontsize=10)
plt.grid(alpha=0.5)
plt.tight_layout()

# NOTE(review): hard-coded, machine-specific Windows output directory.
save_path = 'C:\\Users\\zsc\\Desktop\\evolution computation\\meeting\\2024.12.31\\e-mosaic\\' + datasetname.split('.')[
    0] + '\\'
# Create the output folder if it does not exist yet
folder = os.path.dirname(save_path)
os.makedirs(folder, exist_ok=True)
# Save the figure
plt.savefig(save_path + datasetname.split('.')[0] + 'init_acc.jpg', dpi=300, bbox_inches='tight')
