In [2]:
from utils.dataset_utils import get_distribution, k_fold_cross_validation
from instance_selection.parameter.parameter import *  # 导入参数的设定
from instance_selection.operator.metrics import calculate_gmean_mauc, calculate_accuracy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
import scipy.io as sio  # 从.mat文件中读取数据集
import warnings
import numpy as np

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any convergence warnings raised while fitting
# the MLP — confirm that is intended.
warnings.filterwarnings("ignore")

# Dataset descriptor, indexed like a dict of per-dataset parameters
# (presumably supplied by the wildcard parameter import — confirm).
DATASET = Yeast
# Text before the first '.' of the file name; used for display below.
datasetname = DATASET['DATASETNAME'].partition('.')[0]

# Load the .mat file and extract features and labels.
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
# mat_data['Y'] has shape [n, 1]; selecting column 0 yields shape [n,].
y = mat_data['Y'][:, 0]

# Hold out 20% of the samples for testing with a fixed seed.
# NOTE(review): no `stratify=` argument is passed, so class proportions can
# differ between the two splits — confirm this is acceptable for the
# minority classes.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Standardise features: statistics are learned on the training split only and
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Class distributions of the full dataset, the training split and the test
# split (the third returned value is what gets printed as the distribution).
unique_elements_all, classes_all, counts_all = get_distribution(y)
unique_elements_train, classes_train, counts_train = get_distribution(y_train)
unique_elements_test, classes_test, counts_test = get_distribution(y_test)
print(datasetname + f' distribution: {counts_all}')
print(f'trainset distribution: {counts_train}')
print(f'testset distribution: {counts_test}')

# MLP with a single hidden layer; its width, the iteration cap and the initial
# learning rate are read from the dataset descriptor, seeded with RANDOM_SEED.
model = MLPClassifier(
    hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],),
    max_iter=DATASET['MAX_ITER'],
    learning_rate_init=DATASET['LEARNING_RATE'],
    random_state=RANDOM_SEED,
)

# Per-class weights: reciprocal of each class's training count, normalised so
# that the weights sum to 1 (rarer classes receive larger weights).
weights_train = 1 / counts_train.astype(float)
weights_train = weights_train / np.sum(weights_train)

# 90% of the smallest training-class count, rounded UP by np.ceil.
num_instances = int(np.ceil(0.9 * counts_train.min()))
print("最小数量:", num_instances)

# Cross-validated soft labels for the training set, computed on a fresh clone
# so that `model` itself is left unfitted at this point.
y_train_pred_proba = k_fold_cross_validation(
    model=clone(model),
    X=x_train,
    y=y_train,
    n_splits=N_SPLITS,
    method='soft',
    random_state=RANDOM_SEED,
)

# Hard prediction = index of the most probable column for each sample.
# NOTE(review): this index equals the class label only if the labels are
# 0..K-1 in the same order as the probability columns — confirm.
y_train_pred = np.argmax(y_train_pred_proba, axis=1)

# Three accuracy values computed from the predictions, the true labels and the
# class weights; kept both individually and as a list.
constraints = list(calculate_accuracy(y_train_pred, y_train, weights_train))
Acc1, Acc2, Acc3 = constraints

# Fit on the full training split, then score the held-out test split from the
# predicted class probabilities.
y_test_pred_proba = model.fit(x_train, y_train).predict_proba(x_test)
# The printed result is presumably (G-mean, MAUC, per-class recall) — confirm
# against calculate_gmean_mauc.
print(calculate_gmean_mauc(y_test_pred_proba, y_test))

Yeast distribution: [463  35  44  51 163 243 429  20  30]
trainset distribution: [362  26  36  43 136 193 348  15  23]
testset distribution: [101   9   8   8  27  50  81   5   7]
最小数量: 14
(0.0, 0.883151, array([0.44554455, 0.66666667, 1.        , 0.5       , 0.92592593,
       0.66      , 0.67901235, 0.6       , 0.        ]))
