## 加载数据

In [1]:
from feature_selection import FeatureSelection


def convert_ranking(arr):
    n = len(arr)
    arr_revers = np.zeros(n, dtype=int)
    for rank in range(n):
        feature_index = arr[rank]
        arr_revers[feature_index] = rank  # 假设排名从0开始
    return arr_revers


# 数据集参数
Gisette = {'DATASETNAME': 'gisette.mat'}
Isolet = {'DATASETNAME': 'Isolet.mat'}
Madelon = {'DATASETNAME': 'madelon.mat'}
Wine = {'DATASETNAME': 'Wine.mat'}
GLIOMA = {'DATASETNAME': 'GLIOMA.mat'}
Lung = {'DATASETNAME': 'lung.mat'}
Datasets = [Gisette, Isolet, Madelon, Wine, GLIOMA, Lung]

fs = FeatureSelection(Datasets)
fs.pre_process(Datasets[2], random_state=42)
fs.display_distribution()

trainset distribution: [910 910]
testset distribution: [390 390]
number of feature: 500


## 特征选择
- 应用不同的特征选择算法
- 将选择的特征转换成索引

In [2]:
import numpy as np
from skfeature.function.similarity_based import fisher_score
from skfeature.function.information_theoretical_based import CIFE
from skfeature.function.statistical_based import chi_square

idx_1 = fisher_score.fisher_score(fs.x_train, fs.y_train, mode='index')
idx_1_convert = convert_ranking(idx_1)
# print(idx_1)
print(idx_1_convert)

idx_2 = chi_square.chi_square(fs.x_train, fs.y_train, mode='index')
idx_2_convert = convert_ranking(idx_2)
# print(idx_2)
print(idx_2_convert)

idx_3 = CIFE.cife(fs.x_train, fs.y_train, mode='index', n_selected_features=fs.x_train.shape[1])
idx_3_convert = convert_ranking(idx_3)
# print(idx_3)
print(idx_3_convert)

[308 481 445 441  42 125 406  30 397  44 107 211  82 236 190 100 356 428
  28 438 410 339 497 130 229 117 178 135 244 409 120 309 126 218 237 439
 434 102 393 350 492 326  54 111  33  69  70  65   9 104 382 462 149 202
 232  20 465 495 304  66 260  88 251  52   2  98 349 267 482 279 433 389
 381  47 235 138 184 391 494 335 294 377 347 346 287  63 199  78  64  41
 415 291 374 289 241 227 425  57 329 448 369 323 134 152 413   5 284 170
 322 344 365 351 165 419 436 472 112 123 276  60  96  39 455 405 477  45
  87  48   4  89 115 491 300 132 318 271  83  23 234 450 286 363 296 265
 242 486 258 386 417 416 319 167  80 261 182 263 143 355 195 321 395 161
 257 212  27 288 471 148 228 443 133 312 200 154 264 144 298  74  73 343
 169  85 139 205 293 172 281 129 201 352 476 240 277  62 466 186  31 175
 490  43 316 370 137 250  34  32 468 192 461 270 113  77 367 328 223 157
 101 119 220 255 340 188 475 210 320 488  22 198 467 469 280 206 463 396
 353 401  26 313 479 249 493   1 334  37 303 173 10

## 非支配排序

In [3]:
def non_dominated_sort(arr1, arr2, arr3):
    """
    对三个特征排名数组进行非支配排序，返回所有前沿
    参数:
        arr1, arr2, arr3: 一维ndarray数组，表示三个不同的特征排名 
    返回:
        list of lists: 每个子列表代表一个前沿面，按Front 1, Front 2, ...排列
    """
    # 1. 合并三个数组的排名
    num_features = len(arr1)
    features = np.column_stack((arr1, arr2, arr3))
    # 2. 非支配排序
    domination_counts = np.zeros(num_features, dtype=int)
    dominated_features = [[] for _ in range(num_features)]
    fronts = [[]]  # fronts[0] = Front 1, fronts[1] = Front 2, ...
    # 计算支配关系
    for i in range(num_features):
        for j in range(i + 1, num_features):
            # 检查i是否支配j
            if np.all(features[i] <= features[j]) and np.any(features[i] < features[j]):
                dominated_features[i].append(j)
                domination_counts[j] += 1
    # 初始化第一前沿面（Front 1）
    fronts[0] = [i for i in range(num_features) if domination_counts[i] == 0]
    # 构建后续前沿面
    current_front = 0
    while current_front < len(fronts) and fronts[current_front]:
        next_front = []
        for i in fronts[current_front]:
            for j in dominated_features[i]:
                domination_counts[j] -= 1
                if domination_counts[j] == 0:
                    next_front.append(j)
        current_front += 1
        if next_front:
            fronts.append(next_front)
    # 3. 返回所有前沿（每个前沿内的特征按原始索引升序排列）
    return [sorted(front) for front in fronts]
# 示例用法
# if __name__ == "__main__":
#     # 示例数据
#     arr1 = np.array([1, 3, 2, 5, 4])  # 算法1的排名（越小越好）
#     arr2 = np.array([2, 1, 3, 4, 5])  # 算法2的排名
#     arr3 = np.array([1, 2, 4, 3, 5])  # 算法3的排名
#     # 0、1、
#     all_fronts = non_dominated_sort(arr1, arr2, arr3)
#     print("所有Pareto前沿：")
#     for i, front in enumerate(all_fronts, 1):
#         print(f"Front {i}: {front}")
all_fronts = non_dominated_sort(idx_1_convert, idx_2_convert, idx_3_convert)
print(f"Front 1: {all_fronts[0]}")

Front 1: [0, 1, 2, 3, 4, 6, 7, 12, 18, 21, 44, 46, 47, 48, 64, 85, 105, 110, 125, 164, 171, 241, 281, 284, 286, 300, 336, 349, 378, 403, 442, 472, 475, 493]


## 前后对比

In [8]:
from sklearn.base import clone
from sklearn.neural_network import MLPClassifier
from metrics import calculate_gmean_mauc_f1
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")  # 忽略警告

def train_and_test(model, x_train, x_test, y_train, y_test):
    scaler = StandardScaler()  # 数据的标准化
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    model.fit(x_train, y_train)  # 模型训练
    y_test_pred_proba = model.predict_proba(x_test)
    gmean, mauc, f1 = calculate_gmean_mauc_f1(y_test_pred_proba, y_test)  # 计算准确率指标
    return gmean, mauc, f1

In [9]:
model = MLPClassifier(hidden_layer_sizes=(15,), max_iter=100, random_state=42)
gmean, mauc, f1 = train_and_test(clone(model), fs.x_train, fs.x_test, fs.y_train, fs.y_test)
print(f"gmean: {gmean}, mauc: {mauc}, f1: {f1}")

gmean: 0.526235, mauc: 0.562262, f1: 0.52658


In [10]:
x_train, y_train = SMOTE(random_state=42, k_neighbors=4).fit_resample(fs.x_train, fs.y_train)
gmean, mauc, f1 = train_and_test(clone(model), x_train, fs.x_test, y_train, fs.y_test)
print(f"gmean: {gmean}, mauc: {mauc}, f1: {f1}")

gmean: 0.526235, mauc: 0.562262, f1: 0.52658


In [11]:
x_train, y_train = SMOTE(random_state=42, k_neighbors=4).fit_resample(fs.x_train[:, all_fronts[0]], fs.y_train)
gmean, mauc, f1 = train_and_test(clone(model), x_train, fs.x_test[:, all_fronts[0]], y_train, fs.y_test)
print(f"gmean: {gmean}, mauc: {mauc}, f1: {f1}")

gmean: 0.646149, mauc: 0.696029, f1: 0.646152
