## 不同的采样方式

In [2]:
from instance_selection.operator.metrics import calculate_gmean_mauc
from sklearn.neural_network import MLPClassifier
from utils.dataset_utils import get_distribution
from instance_selection.parameter.parameter import *  # 导入参数的设定
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy.io as sio  # 从.mat文件中读取数据集
import random
import warnings

# Baseline experiment: train an MLP on the original (unresampled) training
# split and report G-mean / mAUC on the held-out test split.

warnings.filterwarnings("ignore")  # Suppress all warnings for the rest of the session

DATASET = German  # Dataset configuration dict (file name plus its hyper-parameters)
datasetname = DATASET['DATASETNAME'].split('.')[0]  # File name up to the first '.'

# Load the dataset from the .mat file and split it into train / test parts
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] turns it into shape [n,]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # Training-set distribution
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # Test-set distribution

# The original concatenation had no separator and printed e.g. "Germandistribution:"
print(f'{datasetname} distribution:')
print(f'trainset: {counts_train}')
print(f'testset: {counts_test}')

# Standardise features; the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Single-hidden-layer MLP configured from the dataset's parameter dict
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])
model.fit(x_train, y_train)
y_test_pred_proba = model.predict_proba(x_test)
gmean, mauc, recall_per_class = calculate_gmean_mauc(y_test_pred_proba, y_test)
print(f'{datasetname} Gmean: {gmean}, mAUC: {mauc}')

Germandistribution:
trainset: [563 237]
testset: [137  63]
German Gmean: 0.633609, mAUC: 0.713591


### ROS 随机过采样

In [3]:
import numpy as np
from sklearn.base import clone
from imblearn.over_sampling import RandomOverSampler

# Random over-sampling (ROS): resample the training split `num_run` times with
# different sampler seeds, retrain a fresh copy of the baseline model each time
# and average the test-set G-mean / mAUC.

num_run = 30  # Number of repetitions

# Per-run seed offsets come from a dedicated generator seeded with RANDOM_SEED.
# The module-level `random.randint` used before was never seeded, so every
# execution of this cell produced a different sequence of samplers and the
# averaged result could not be reproduced.
ros_seed_rng = random.Random(RANDOM_SEED)

ros_results = []
for i in range(num_run):
    # randint is inclusive on both ends, same range as the original call
    ros = RandomOverSampler(random_state=42 + ros_seed_rng.randint(1, 1000))
    x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)

    # Uncomment to inspect the class distribution after over-sampling:
    # unique_elements_ros, classes_ros, counts_ros = get_distribution(y_train_ros)
    # print(f'ros trainset: {counts_ros}')

    model_ros = clone(model)  # Unfitted copy with the same hyper-parameters
    model_ros.fit(x_train_ros, y_train_ros)
    y_test_pred_proba_ros = model_ros.predict_proba(x_test)
    gmean_ros, mauc_ros, recall_per_class_ros = calculate_gmean_mauc(y_test_pred_proba_ros, y_test)
    print(f'第{i + 1}次运行: Gmean: {gmean_ros}, mAUC: {mauc_ros}')
    ros_results.append([gmean_ros, mauc_ros])

# Column-wise mean over all runs: [mean G-mean, mean mAUC]
print(np.mean(ros_results, axis=0))

第1次运行: Gmean: 0.642596, mAUC: 0.6954
第2次运行: Gmean: 0.695917, mAUC: 0.715676
第3次运行: Gmean: 0.644846, mAUC: 0.722164
第4次运行: Gmean: 0.608231, mAUC: 0.683814
第5次运行: Gmean: 0.624306, mAUC: 0.721237
第6次运行: Gmean: 0.67341, mAUC: 0.713591
第7次运行: Gmean: 0.635708, mAUC: 0.687638
第8次运行: Gmean: 0.648251, mAUC: 0.715676
第9次运行: Gmean: 0.617026, mAUC: 0.705249
第10次运行: Gmean: 0.61618, mAUC: 0.70884
第11次运行: Gmean: 0.646192, mAUC: 0.74441
第12次运行: Gmean: 0.661257, mAUC: 0.742903
第13次运行: Gmean: 0.676928, mAUC: 0.741166
第14次运行: Gmean: 0.710417, mAUC: 0.753563
第15次运行: Gmean: 0.695917, mAUC: 0.748001
第16次运行: Gmean: 0.692579, mAUC: 0.756807
第17次运行: Gmean: 0.670911, mAUC: 0.726915
第18次运行: Gmean: 0.695584, mAUC: 0.759472
第19次运行: Gmean: 0.652526, mAUC: 0.743135
第20次运行: Gmean: 0.672205, mAUC: 0.757154
第21次运行: Gmean: 0.635708, mAUC: 0.694473
第22次运行: Gmean: 0.682214, mAUC: 0.735141
第23次运行: Gmean: 0.658624, mAUC: 0.728189
第24次运行: Gmean: 0.582545, mAUC: 0.664929
第25次运行: Gmean: 0.680428, mAUC: 0.725408
第26次运行: Gmean: 

### RUS 随机欠采样

In [4]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.base import clone

# Random under-sampling (RUS): resample the training split `num_run` times with
# different sampler seeds, retrain a fresh copy of the baseline model each time
# and average the test-set G-mean / mAUC.

num_run = 30  # Number of repetitions

# Per-run seed offsets come from a generator seeded with RANDOM_SEED. The
# global `np.random.randint` used before was never seeded, so the averaged
# result changed on every execution of this cell.
rus_seed_rng = np.random.default_rng(RANDOM_SEED)

rus_results = []
for i in range(num_run):
    # integers(1, 1000) excludes the upper bound, like np.random.randint(1, 1000);
    # int() hands the sampler a plain Python integer.
    rus = RandomUnderSampler(random_state=42 + int(rus_seed_rng.integers(1, 1000)))
    x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)

    # Class distribution of the training data after under-sampling
    unique_elements_rus, classes_rus, counts_rus = get_distribution(y_train_rus)
    print(f'rus trainset: {counts_rus}')

    model_rus = clone(model)  # Unfitted copy with the same hyper-parameters
    model_rus.fit(x_train_rus, y_train_rus)
    y_test_pred_proba_rus = model_rus.predict_proba(x_test)
    gmean_rus, mauc_rus, recall_per_class_rus = calculate_gmean_mauc(y_test_pred_proba_rus, y_test)
    print(f'第{i + 1}次运行: Gmean: {gmean_rus}, mAUC: {mauc_rus}')
    rus_results.append([gmean_rus, mauc_rus])

# Column-wise mean over all runs: [mean G-mean, mean mAUC]
print(np.mean(rus_results, axis=0))

rus trainset: [237 237]
第1次运行: Gmean: 0.730043, mAUC: 0.790059
rus trainset: [237 237]
第2次运行: Gmean: 0.658096, mAUC: 0.722975
rus trainset: [237 237]
第3次运行: Gmean: 0.677356, mAUC: 0.764801
rus trainset: [237 237]
第4次运行: Gmean: 0.655891, mAUC: 0.735488
rus trainset: [237 237]
第5次运行: Gmean: 0.688553, mAUC: 0.74244
rus trainset: [237 237]
第6次运行: Gmean: 0.669095, mAUC: 0.744873
rus trainset: [237 237]
第7次运行: Gmean: 0.653857, mAUC: 0.706871
rus trainset: [237 237]
第8次运行: Gmean: 0.690569, mAUC: 0.756807
rus trainset: [237 237]
第9次运行: Gmean: 0.663531, mAUC: 0.732128
rus trainset: [237 237]
第10次运行: Gmean: 0.713996, mAUC: 0.78948
rus trainset: [237 237]
第11次运行: Gmean: 0.599502, mAUC: 0.707334
rus trainset: [237 237]
第12次运行: Gmean: 0.661082, mAUC: 0.710346
rus trainset: [237 237]
第13次运行: Gmean: 0.707475, mAUC: 0.783339
rus trainset: [237 237]
第14次运行: Gmean: 0.6753, mAUC: 0.719731
rus trainset: [237 237]
第15次运行: Gmean: 0.642236, mAUC: 0.692388
rus trainset: [237 237]
第16次运行: Gmean: 0.718445, mAUC

## 测试

In [5]:
from machine_learning.sampling.sample import sample_dataset
from sklearn.neural_network import MLPClassifier
from instance_selection.parameter.parameter import *  # 导入参数的设定
from sklearn.model_selection import train_test_split
import scipy.io as sio  # 从.mat文件中读取数据集
import numpy as np
import warnings
from sklearn.base import clone

# Side-by-side comparison of ROS, RUS, NOS and SMOTE: each run draws one
# random_state, passes it to sample_dataset for all four methods, and the
# per-method G-mean / mAUC are averaged over `num_run` runs.

warnings.filterwarnings("ignore")  # Suppress all warnings for the rest of the session

DATASET = German  # Dataset configuration dict (file name plus its hyper-parameters)
datasetname = DATASET['DATASETNAME'].split('.')[0]  # File name up to the first '.'

# Load the dataset from the .mat file
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] turns it into shape [n,]

num_run = 30  # Number of repetitions

nos_results = []
ros_results = []
rus_results = []
smote_results = []

# Template estimator; every sample_dataset call receives its own unfitted clone
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])

# Per-run seed offsets come from a generator seeded with RANDOM_SEED. The
# global `np.random.randint` used before was never seeded, so the reported
# averages changed on every execution of this cell.
seed_rng = np.random.default_rng(RANDOM_SEED)

for i in range(num_run):
    print(f'第{i + 1}次运行: *************************************')
    # integers(1, 1000) excludes the upper bound, like np.random.randint(1, 1000)
    random_state = RANDOM_SEED + int(seed_rng.integers(1, 1000))
    # NOTE(review): the split is seeded with the fixed RANDOM_SEED, not the
    # per-run random_state, so every run uses the same train/test partition
    # (the NOS scores in the recorded output are identical across runs).
    # Confirm this is intended before changing it.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)

    gmean_ros, mauc_ros = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='ROS')
    gmean_rus, mauc_rus = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='RUS')
    gmean_nos, mauc_nos = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='NOS')
    gmean_smote, mauc_smote = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='SMOTE')

    print(f'gmean_ros: {gmean_ros}, mauc_ros: {mauc_ros}')
    print(f'gmean_rus: {gmean_rus}, mauc_rus: {mauc_rus}')
    print(f'gmean_nos: {gmean_nos}, mauc_nos: {mauc_nos}')
    print(f'gmean_smote: {gmean_smote}, mauc_smote: {mauc_smote}')
    nos_results.append([gmean_nos, mauc_nos])
    ros_results.append([gmean_ros, mauc_ros])
    rus_results.append([gmean_rus, mauc_rus])
    smote_results.append([gmean_smote, mauc_smote])

# Column-wise mean over all runs for each method: [mean G-mean, mean mAUC]
print(f'ROS: {np.mean(ros_results, axis=0)}')
print(f'RUS: {np.mean(rus_results, axis=0)}')
print(f'NOS: {np.mean(nos_results, axis=0)}')
print(f'SMOTE: {np.mean(smote_results, axis=0)}')


第1次运行: *************************************
gmean_ros: 0.673066, mauc_ros: 0.754837
gmean_rus: 0.657832, mauc_rus: 0.703742
gmean_nos: 0.633609, mauc_nos: 0.713591
gmean_smote: 0.482578, mauc_smote: 0.819372
第2次运行: *************************************
gmean_ros: 0.66684, mauc_ros: 0.761325
gmean_rus: 0.612692, mauc_rus: 0.671069
gmean_nos: 0.633609, mauc_nos: 0.713591
gmean_smote: 0.659239, mauc_smote: 0.803151
第3次运行: *************************************
gmean_ros: 0.690905, mauc_ros: 0.752868
gmean_rus: 0.642867, mauc_rus: 0.71081
gmean_nos: 0.633609, mauc_nos: 0.713591
gmean_smote: 0.659239, mauc_smote: 0.795852
第4次运行: *************************************
gmean_ros: 0.645565, mauc_ros: 0.721122
gmean_rus: 0.669615, mauc_rus: 0.709651
gmean_nos: 0.633609, mauc_nos: 0.713591
gmean_smote: 0.708457, mauc_smote: 0.792608
第5次运行: *************************************
gmean_ros: 0.676329, mauc_ros: 0.76063
gmean_rus: 0.647536, mauc_rus: 0.754142
gmean_nos: 0.633609, mauc_nos: 0.713591
gm