In [5]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from metrics.metrics import calculate_gmean_mauc
from sklearn.model_selection import train_test_split
from utils.dataset_utils import get_distribution
from imbens.ensemble import SelfPacedEnsembleClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from instance_selection.parameter.parameter import *  # 导入参数的设定
import scipy.io as sio  # 从.mat文件中读取数据集
from sklearn.base import clone
import warnings
from imblearn.ensemble import EasyEnsembleClassifier
warnings.filterwarnings("ignore")  # 忽略警告

# Dataset configuration object. Below it supplies the .mat file name
# (DATASETNAME) and the MLP hyper-parameters (HIDDEN_SIZE, MAX_ITER,
# LEARNING_RATE).
DATASET = Satellite

# Display name: the file name up to (not including) its first '.'.
datasetname, _, _ = DATASET.DATASETNAME.partition('.')

# Load the feature matrix and labels from the .mat file.
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET.DATASETNAME)
# Per the original author's note, mat_data['Y'] has shape [n, 1]; taking
# column 0 yields a 1-D label vector of shape [n,].
x, y = mat_data['X'], mat_data['Y'][:, 0]

# Stratified 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Base MLP with a single hidden layer. Within this cell it is referenced only
# by the commented-out SelfPacedEnsemble experiment.
model = MLPClassifier(
    hidden_layer_sizes=(DATASET.HIDDEN_SIZE,),
    learning_rate_init=DATASET.LEARNING_RATE,
    max_iter=DATASET.MAX_ITER,
    random_state=RANDOM_SEED,
)

# Report the class distribution of the full dataset, the training split and
# the test split. Only the counts are printed; the other two return values of
# get_distribution are kept as globals under their original names.
unique_elements_all, classes_all, counts_all = get_distribution(y)
unique_elements_train, classes_train, counts_train = get_distribution(y_train)
unique_elements_test, classes_test, counts_test = get_distribution(y_test)

# One print call with a newline separator emits the same three lines.
print(
    f'{datasetname} distribution: {counts_all}',
    f'trainset distribution: {counts_train}',
    f'testset distribution: {counts_test}',
    sep='\n',
)
# Bookkeeping for the repeated-run experiment. In this cell these names are
# only consumed by the commented-out loop at the bottom.
num_run = 30
gmean_results, mauc_results, results = [], [], []

# Single EasyEnsemble run: 30 estimators, fixed seed.
eec = EasyEnsembleClassifier(n_estimators=30, random_state=42)
# fit() returns the fitted estimator itself, so the call can be chained.
y_pred_proba = eec.fit(x_train, y_train).predict_proba(x_test)

# Score the predicted class probabilities against the held-out labels.
gmean, mauc, recall_per_class = calculate_gmean_mauc(y_pred_proba, y_test)
print(
    f'gmean: {gmean:.4f}, mauc: {mauc:.4f}, '
    f'recall_per_class: {recall_per_class}'
)

# Disabled experiment: repeat a SelfPacedEnsemble (wrapping a clone of the
# base MLP) num_run times with random seeds and report the mean scores.
# for i in range(0, num_run):
#     clf = SelfPacedEnsembleClassifier(estimator=clone(model), random_state=np.random.randint(1, 10000), n_estimators=30)
#     clf.fit(x_train, y_train)
#     y_pred = clf.predict(x_test)
#     y_pred_proba = clf.predict_proba(x_test)
#     gmean, mauc, recall_per_class = calculate_gmean_mauc(y_pred_proba, y_test)
#     results.append([gmean, mauc])
#     gmean_results.append([gmean])
#     mauc_results.append([mauc])
#     print(f'Run {i + 1}: gmean: {gmean:.4f}, mauc: {mauc:.4f}, recall_per_class: {recall_per_class}')
# print(f'Mean {np.mean(results, axis=0)}')


Satellite distribution: [1533  703 1358  626  707 1508]
trainset distribution: [1073  492  951  438  495 1055]
testset distribution: [460 211 407 188 212 453]
gmean: 0.7818, mauc: 0.9349, recall_per_class: [0.84347826 0.87203791 0.90663391 0.56914894 0.82075472 0.73289183]
