## 加载数据

In [1]:
import pandas as pd
from sklearn.base import clone
from config import Datasets, Datasets_2
from feature_selection import FeatureSelection, non_dominated_sort, train_and_test
from skfeature.function.similarity_based import fisher_score
from skfeature.function.information_theoretical_based import CIFE
from skfeature.function.statistical_based import chi_square
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings("ignore")  # 忽略警告

# 数据集
fs = FeatureSelection(Datasets_2)
# 创建带有列名的空DataFrame
df = pd.DataFrame(columns=['数据集', '类分布', '特征数量', '特征选择数量', '原始', 'SMOTE', '特征选择+SMOTE'])
for i in range(len(Datasets_2)):
    # 数据预处理
    fs.pre_process(Datasets_2[i], random_state=42)
    print(f"{i + 1}th dataset: {fs.dataset.DATASETNAME}")
    fs.display_distribution()

    # 特征选择（三种不同的选择算法）
    idx_1 = fs.feature_selection(fisher_score.fisher_score, mode='index')
    # print(idx_1)
    idx_2 = fs.feature_selection(chi_square.chi_square, mode='index')
    # print(idx_2)
    idx_3 = fs.feature_selection(CIFE.cife, mode='index', n_selected_features=fs.x_train.shape[1])
    # print(idx_3)

    # 非支配排序（三种算法的特征排名）
    all_fronts = non_dominated_sort(idx_1, idx_2, idx_3)

    # 前后结果对比（原始数据、SMOTE、特征选择+SMOTE）
    model = MLPClassifier(hidden_layer_sizes=(fs.dataset.HIDDEN_SIZE,), max_iter=fs.dataset.MAX_ITER,
                          random_state=42, learning_rate_init=fs.dataset.LEARNING_RATE)
    res_1 = train_and_test(clone(model), fs.x_train, fs.x_test, fs.y_train, fs.y_test)
    print(f"原始：{res_1}")
    x_train, y_train = SMOTE(random_state=42, k_neighbors=fs.dataset.K_NEIGHBORS).fit_resample(fs.x_train, fs.y_train)
    res_2 = train_and_test(clone(model), x_train, fs.x_test, y_train, fs.y_test)
    print(f"SMOTE：{res_2}")
    x_train, y_train = SMOTE(random_state=42, k_neighbors=fs.dataset.K_NEIGHBORS).fit_resample(
        fs.x_train[:, all_fronts[0]], fs.y_train)
    res_3 = train_and_test(clone(model), x_train, fs.x_test[:, all_fronts[0]], y_train, fs.y_test)
    print(f"特征选择+SMOTE：{res_3}")
    # 保存结果
    df.loc[i] = [fs.dataset.DATASETNAME, fs.distribution, fs.x_train.shape[1],
                 len(all_fronts[0]), res_1, res_2, res_3]
#  保存结果
df.to_csv('feature_selection_result.csv', index=False)

1th dataset: Armstrong-2002-v1.mat
trainset distribution: [17 33]
testset distribution: [ 7 15]
number of feature: 1081
原始：(0.966092, 1.0, 0.949425)
SMOTE：(1.0, 1.0, 1.0)
特征选择+SMOTE：(0.730297, 0.87619, 0.770833)
2th dataset: Gordon-2002.mat
trainset distribution: [ 22 104]
testset distribution: [ 9 46]
number of feature: 1626
原始：(0.989071, 1.0, 0.96819)
SMOTE：(1.0, 1.0, 1.0)
特征选择+SMOTE：(1.0, 1.0, 1.0)
3th dataset: Colon.mat
trainset distribution: [15 28]
testset distribution: [ 7 12]
number of feature: 2000
原始：(0.886405, 0.916667, 0.886905)
SMOTE：(0.886405, 0.916667, 0.886905)
特征选择+SMOTE：(0.763763, 0.880952, 0.736842)
4th dataset: Yeoh-2002-v1.mat
trainset distribution: [143  30]
testset distribution: [62 13]
number of feature: 2526
原始：(0.889698, 0.952854, 0.868267)
SMOTE：(0.825313, 0.954094, 0.871619)
特征选择+SMOTE：(0.897335, 0.98139, 0.887082)
5th dataset: DLBCL.mat
trainset distribution: [40 13]
testset distribution: [18  6]
number of feature: 5469


ValueError: Input X must be non-negative.