# SelfPacedEnsembleClassifier

In [12]:
from sklearn.neural_network import MLPClassifier
from metrics.metrics import calculate_gmean_mauc
from sklearn.model_selection import train_test_split
from utils.dataset_utils import get_distribution
from imbens.ensemble import SelfPacedEnsembleClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from instance_selection.parameter.parameter import *  # 导入参数的设定
import scipy.io as sio  # 从.mat文件中读取数据集
from sklearn.base import clone
import warnings

# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

# Dataset configuration: supplies the .mat file name and the MLP hyper-parameters read below.
DATASET = Nursery
datasetname = DATASET.DATASETNAME.split('.')[0]  # file name up to its first '.'

# Read the .mat file and pull out the feature matrix and the labels.
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET.DATASETNAME)
x = mat_data['X']
# Per the original author's note, 'Y' is stored with shape [n, 1]; taking column 0 yields shape [n,].
y = mat_data['Y'][:, 0]

# NOTE(review): test_size=0.8 holds out 80% of the samples for testing, the split is not
# stratified, and its seed (42) is independent of RANDOM_SEED -- confirm all three are intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.8,
    random_state=42,
)

# Single-hidden-layer MLP configured from the dataset's parameter set.
model = MLPClassifier(
    hidden_layer_sizes=(DATASET.HIDDEN_SIZE,),
    max_iter=DATASET.MAX_ITER,
    learning_rate_init=DATASET.LEARNING_RATE,
    random_state=RANDOM_SEED,
)

# Collect the distribution of the full label vector and of each split.
unique_elements_all, classes_all, counts_all = get_distribution(y)
unique_elements_train, classes_train, counts_train = get_distribution(y_train)
unique_elements_test, classes_test, counts_test = get_distribution(y_test)

# Print the three count summaries, one per line: whole dataset, training split, test split.
distribution_lines = (
    f'{datasetname} distribution: {counts_all}',
    f'trainset distribution: {counts_train}',
    f'testset distribution: {counts_test}',
)
print('\n'.join(distribution_lines))

# Build the self-paced ensemble around a fresh clone of `model`, then fit it on the training split.
clf = SelfPacedEnsembleClassifier(
    estimator=clone(model),
    n_estimators=30,
    random_state=0,
)
clf.fit(x_train, y_train)

# Hard labels feed the sklearn metrics; probabilities feed the project's gmean/mauc helper.
y_pred = clf.predict(x_test)
y_pred_proba = clf.predict_proba(x_test)

# G-mean, MAUC and per-class recall, as returned by the project helper.
gmean, mauc, recall_per_class = calculate_gmean_mauc(y_pred_proba, y_test)
print(f'gmean: {gmean:.4f}, mauc: {mauc:.4f}, recall_per_class: {recall_per_class}')

# Compute the remaining sklearn metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Accuracy.
print(f'模型准确率: {accuracy:.4f}')

# Confusion matrix.
print("\n混淆矩阵：\n", cm)

# Classification report.
print("\n分类报告：\n", report)

Nursery distribution: [4320 4266 4044  328]
trainset distribution: [857 873 806  55]
testset distribution: [3463 3393 3238  273]
gmean: 0.8172, mauc: 0.9649, recall_per_class: [1.         0.51311524 0.8721433  0.996337  ]
模型准确率: 0.8006

混淆矩阵：
 [[3463    0    0    0]
 [   0 1741  855  797]
 [   0  361 2824   53]
 [   0    1    0  272]]

分类报告：
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3463
           1       0.83      0.51      0.63      3393
           2       0.77      0.87      0.82      3238
           3       0.24      1.00      0.39       273

    accuracy                           0.80     10367
   macro avg       0.71      0.85      0.71     10367
weighted avg       0.85      0.80      0.81     10367
