# E-SEIC
Selection of evolutionary instances with constraints for unbalanced datasets

In [3]:
import statistics
from utils.dataset_utils import get_distribution, k_fold_cross_validation
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection.operator.init_toolbox import init_toolbox_eseic
from instance_selection.operator.metrics import calculate_accuracy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
import scipy.io as sio  # 从.mat文件中读取数据集
import random
import warnings
import numpy as np
import os
from openpyxl import Workbook

warnings.filterwarnings("ignore")  # 忽略警告
from utils.excel_utils import save_to_excel_2

# 数据的预处理
def data_process(dataset=None, distribution=False):
    datasetname = dataset.DATASETNAME.split('.')[0]
    mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + dataset.DATASETNAME)  # 加载、划分数据集
    x = mat_data['X']
    y = mat_data['Y'][:, 0]  # mat_data['Y']得到的形状为[n,1]，通过[:,0]，得到形状[n,]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y,
                                                        random_state=RANDOM_SEED)  # 划分数据集
    scaler = StandardScaler()  # 数据的标准化
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    unique_elements_all, classes_all, counts_all = get_distribution(y)  # 获取原始数据集分布
    unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # 获取训练集分布
    unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # 获取测试集分布
    weights_train = (1 / counts_train.astype(float)) / np.sum(1 / counts_train.astype(float))  # 计算每个类的权重，用于计算每个类别的权重
    if distribution:
        print(datasetname + f' distribution: {counts_all}')
        print(f'trainset distribution: {counts_train}')
        print(f'testset distribution: {counts_test}')
    model = MLPClassifier(hidden_layer_sizes=(dataset.HIDDEN_SIZE,), max_iter=dataset.MAX_ITER,
                          random_state=RANDOM_SEED, learning_rate_init=dataset.LEARNING_RATE)
    y_train_pred_proba = k_fold_cross_validation(model=clone(model), X=x_train, y=y_train, n_splits=N_SPLITS - 2,
                                                 method='soft',
                                                 random_state=RANDOM_SEED)  # 交叉验证得到软标签
    # 将概率转化为预测结果
    y_train_pred = np.argmax(y_train_pred_proba, axis=1)

    Acc1, Acc2, Acc3 = calculate_accuracy(y_train_pred, y_train, weights_train)
    constraints = [Acc1, Acc2, Acc3]

    return x_train, x_test, y_train, y_test, constraints, weights_train, clone(model)

def main(balanced_method='random'):
    ####################################种群的初始化###########################
    pop = toolbox.population(n=POPSIZE)  # 个体编码默认全为0
    pop = toolbox.init_population(pop, balanced_method=balanced_method)  # 初始化种群中的个体
    toolbox.evaluate(pop)  # 计算个体的适应度
    gmean_list = []
    mauc_list = []
    acc1_list = []
    acc2_list = []
    acc3_list = []
    for ind in pop:
        gmean_list.append(ind.gmean)
        mauc_list.append(ind.mauc)
        acc1_list.append(ind.fitness.values[0])
        acc2_list.append(ind.fitness.values[1])
        acc3_list.append(ind.fitness.values[2])
    # 求出list的中值
    median_gmean = statistics.median(gmean_list)
    median_mauc = statistics.median(mauc_list)
    median_acc1 = statistics.median(acc1_list)
    median_acc2 = statistics.median(acc2_list)
    median_acc3 = statistics.median(acc3_list)
    return median_gmean, median_mauc, median_acc1, median_acc2, median_acc3
def save_to_excel(data, save_path, filename='avg_results'):
    """
    将列表数据逐行写入Excel文件
    参数:
        data: 二维列表，每个子列表代表一行数据
        filename: 输出的Excel文件名(默认为output.xlsx)
    """
    # 创建一个新的工作簿
    wb = Workbook()
    # 获取活动的工作表
    ws = wb.active
    # 逐行写入数据
    for row in data:
        avg = row[1].tolist()
        avg.insert(0, row[0])
        std = row[2].tolist()
        std.insert(0, row[0])
        ws.append(avg)
        ws.append(std)
    # 创建 Excel 文件完整路径
    file_path = os.path.join(save_path, filename + ".xlsx")
    # 保存Excel文件
    wb.save(file_path)
    print(f"数据已成功写入到 {file_path}")

## 运行

In [4]:
DATASETS = [Balance_Scale,Dermatology,Ecoli,Car,Pen_Digits,WallRobot,German,Wine,Nursery,Penbased,USPS,Satellite,Page_Blocks,Shuttle,Contraceptive,Automobile,Ovarian]  # 数据集名称（包含对应的参数配置）
# DATASETS = [USPS, Satellite, Page_Blocks, Shuttle, Contraceptive, Automobile, Ovarian]  # 数据集名称（包含对应的参数配置）
if __name__ == "__main__":
    save_path = 'C:/Users/zsc/Desktop/Random_Balanced/Balanced/'
    
    columns = ['Gmean', 'MAUC', 'Acc1', 'Acc2', 'Acc3']
    datasets_median_results = [[] for _ in range(len(DATASETS))]
    print("*****************算法开始执行：******************")
    for j, dataset in enumerate(DATASETS):
        x_train, x_test, y_train, y_test, constraints, weights_train, model = data_process(dataset=dataset,
                                                                                           distribution=False)
        toolbox = init_toolbox_eseic(model, x_train, y_train, weights_train, constraints, n_splits=N_SPLITS,
                                     random_seed=42)  # 初始化toolbox
        num_run = 40  # 运行次数
        median_results = [[] for _ in range(num_run)]
        for i in range(num_run):
            median_gmean, median_mauc, median_acc1, median_acc2, median_acc3 = main(balanced_method='balanced')
            median_results[i] = [median_gmean, median_mauc, median_acc1, median_acc2, median_acc3]
            print(f"第{i + 1}次执行：Gmean：{median_gmean}，mAUC：{median_gmean}")
        save_to_excel_2(save_path + dataset.DATASETNAME.split('.')[0] + '/', dataset.DATASETNAME.split('.')[0], columns,
                        median_results)
        median_result_mean = np.mean(median_results, axis=0)
        # 计算ensembles_resultsz中每一列的标准差
        median_result_std = np.std(median_results, axis=0)
        print(f'中位数（平均值）：{median_result_mean}')
        print(f'集成分类结果（标准差）：{median_result_std}')
        datasets_median_results[j] = [dataset.DATASETNAME.split('.')[0], median_result_mean, median_result_std]
    print("*****************算法执行结束！******************")
    # 写入到Excel     
    save_to_excel(datasets_median_results, save_path)

*****************算法开始执行：******************
第1次执行：Gmean：0.868912，mAUC：0.868912
第2次执行：Gmean：0.8707685000000001，mAUC：0.8707685000000001
第3次执行：Gmean：0.869557，mAUC：0.869557
第4次执行：Gmean：0.8813885，mAUC：0.8813885
第5次执行：Gmean：0.8588830000000001，mAUC：0.8588830000000001
第6次执行：Gmean：0.868816，mAUC：0.868816
第7次执行：Gmean：0.8809995，mAUC：0.8809995
第8次执行：Gmean：0.88159，mAUC：0.88159
第9次执行：Gmean：0.880638，mAUC：0.880638
第10次执行：Gmean：0.86417，mAUC：0.86417
第11次执行：Gmean：0.859474，mAUC：0.859474
第12次执行：Gmean：0.870569，mAUC：0.870569
第13次执行：Gmean：0.870569，mAUC：0.870569
第14次执行：Gmean：0.87511，mAUC：0.87511
第15次执行：Gmean：0.876279，mAUC：0.876279
第16次执行：Gmean：0.8707685000000001，mAUC：0.8707685000000001
第17次执行：Gmean：0.8695865，mAUC：0.8695865
第18次执行：Gmean：0.870569，mAUC：0.870569
第19次执行：Gmean：0.8697715，mAUC：0.8697715
第20次执行：Gmean：0.875272，mAUC：0.875272
第21次执行：Gmean：0.8697715，mAUC：0.8697715
第22次执行：Gmean：0.864726，mAUC：0.864726
第23次执行：Gmean：0.8791355，mAUC：0.8791355
第24次执行：Gmean：0.875272，mAUC：0.875272
第25次执行：Gmean：0.869001，mAUC：0.869001
