##  不同的采样方式

In [None]:
from instance_selection.operator.metrics import calculate_gmean_mauc
from sklearn.neural_network import MLPClassifier
from utils.dataset_utils import get_distribution
from instance_selection.parameter.parameter import *  # 导入参数的设定
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy.io as sio  # 从.mat文件中读取数据集
import random
import warnings

warnings.filterwarnings("ignore")  # suppress warnings

DATASET = Ecoli  # dataset selection (a dict that carries the dataset's parameters)
datasetname = DATASET['DATASETNAME'].split('.')[0]  # text before the first '.' of the file name

# Load the dataset and split it into train/test parts
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] turns it into shape [n,]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)  # 80/20 split
unique_elements_train, classes_train, counts_train = get_distribution(y_train)  # training-set distribution
unique_elements_test, classes_test, counts_test = get_distribution(y_test)  # test-set distribution

# A separating space keeps the dataset name from running into the label.
print(datasetname + ' distribution:')
print(f'trainset: {counts_train}')
print(f'testset: {counts_test}')

# Standardize features: fit the scaler on the training data only, then apply
# the same transform to the test data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Baseline: train the MLP on the original training set (no resampling here)
# and evaluate it on the test set.
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])
model.fit(x_train, y_train)
y_test_pred_proba = model.predict_proba(x_test)
gmean, mauc, recall_per_class = calculate_gmean_mauc(y_test_pred_proba, y_test)
print(f'{datasetname} Gmean: {gmean}, mAUC: {mauc}')

### ROS 随机过采样

In [None]:
import numpy as np
from sklearn.base import clone
from imblearn.over_sampling import RandomOverSampler

num_run = 30  # number of repeated runs

ros_results = []
for i in range(num_run):
    # Seed offset drawn with NumPy (imported in this cell) so the cell does not
    # rely on `random` being imported elsewhere; offset range is [1, 1000).
    ros = RandomOverSampler(random_state=42 + np.random.randint(1, 1000))
    x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)

    # unique_elements_ros, classes_ros, counts_ros = get_distribution(y_train_ros)  # distribution after over-sampling
    # print(f'ros trainset: {counts_ros}')

    # Train an unfitted copy of the base model on the over-sampled training
    # set and evaluate it on the test set.
    model_ros = clone(model)
    model_ros.fit(x_train_ros, y_train_ros)
    y_test_pred_proba_ros = model_ros.predict_proba(x_test)
    gmean_ros, mauc_ros, recall_per_class_ros = calculate_gmean_mauc(y_test_pred_proba_ros, y_test)
    print(f'第{i + 1}次运行: Gmean: {gmean_ros}, mAUC: {mauc_ros}')
    ros_results.append([gmean_ros, mauc_ros])

# Column-wise mean over all runs: [mean Gmean, mean mAUC]
print(np.mean(ros_results, axis=0))

### RUS 随机欠采样

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.base import clone

num_run = 30  # how many times the experiment is repeated

rus_results = []
for i in range(num_run):
    # A new under-sampler per run; its seed is 42 plus an offset from [1, 1000).
    sampler_seed = 42 + np.random.randint(1, 1000)
    rus = RandomUnderSampler(random_state=sampler_seed)
    x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)

    # Report the class distribution of the under-sampled training set.
    unique_elements_rus, classes_rus, counts_rus = get_distribution(y_train_rus)
    print(f'rus trainset: {counts_rus}')

    # Fit an unfitted copy of the base model on the resampled data, then score
    # its predicted probabilities against the test labels.
    model_rus = clone(model).fit(x_train_rus, y_train_rus)
    y_test_pred_proba_rus = model_rus.predict_proba(x_test)
    gmean_rus, mauc_rus, recall_per_class_rus = calculate_gmean_mauc(
        y_test_pred_proba_rus, y_test
    )
    print(f'第{i + 1}次运行: Gmean: {gmean_rus}, mAUC: {mauc_rus}')
    rus_results += [[gmean_rus, mauc_rus]]

# Column-wise mean over all runs: [mean Gmean, mean mAUC]
print(np.mean(rus_results, axis=0))

## 测试

In [10]:
from machine_learning.sampling.sample import sample_dataset
from sklearn.neural_network import MLPClassifier
from instance_selection.parameter.parameter import *  # 导入参数的设定
from sklearn.model_selection import train_test_split
import scipy.io as sio  # 从.mat文件中读取数据集
import numpy as np
import warnings
from sklearn.base import clone

warnings.filterwarnings("ignore")  # suppress warnings

DATASET = Ecoli  # dataset selection (a dict that carries the dataset's parameters)
datasetname = DATASET['DATASETNAME'].split('.')[0]  # text before the first '.' of the file name

# Load the dataset
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y'] has shape [n, 1]; [:, 0] turns it into shape [n,]

num_run = 30  # number of repeated runs

# One [gmean, mauc] pair is appended per run for each method.
nos_results = []
ros_results = []
rus_results = []
smote_results = []

# Base model; each evaluation below receives its own unfitted clone.
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])

# The split is seeded with the fixed RANDOM_SEED, so it is identical for every
# run; compute it once instead of once per iteration.
# NOTE(review): if every run is meant to use a different split, pass the
# per-run `random_state` to train_test_split inside the loop instead.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)

for i in range(num_run):
    print(f'第{i + 1}次运行: *************************************')
    # Per-run seed handed to sample_dataset: RANDOM_SEED plus an offset from [1, 1000).
    random_state = RANDOM_SEED + np.random.randint(1, 1000)

    # Same data and the same per-run seed for all four methods.
    gmean_ros, mauc_ros = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='ROS')
    gmean_rus, mauc_rus = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='RUS')
    gmean_nos, mauc_nos = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='NOS')
    gmean_smote, mauc_smote = sample_dataset(clone(model), x_train, x_test, y_train, y_test, random_state, method='SMOTE')

    print(f'gmean_ros: {gmean_ros}, mauc_ros: {mauc_ros}')
    print(f'gmean_rus: {gmean_rus}, mauc_rus: {mauc_rus}')
    print(f'gmean_nos: {gmean_nos}, mauc_nos: {mauc_nos}')
    print(f'gmean_smote: {gmean_smote}, mauc_smote: {mauc_smote}')
    nos_results.append([gmean_nos, mauc_nos])
    ros_results.append([gmean_ros, mauc_ros])
    rus_results.append([gmean_rus, mauc_rus])
    smote_results.append([gmean_smote, mauc_smote])

# Column-wise means over all runs: [mean Gmean, mean mAUC] per method
print(f'ROS: {np.mean(ros_results, axis=0)}')
print(f'RUS: {np.mean(rus_results, axis=0)}')
print(f'NOS: {np.mean(nos_results, axis=0)}')
print(f'SMOTE: {np.mean(smote_results, axis=0)}')


第1次运行: *************************************
gmean_ros: 0.85477, mauc_ros: 0.979833
gmean_rus: 0.67911, mauc_rus: 0.922694
gmean_nos: 0.840024, mauc_nos: 0.970611
gmean_smote: 0.81819, mauc_smote: 0.948889
第2次运行: *************************************
gmean_ros: 0.868564, mauc_ros: 0.97675
gmean_rus: 0.71933, mauc_rus: 0.949917
gmean_nos: 0.840024, mauc_nos: 0.970611
gmean_smote: 0.868564, mauc_smote: 0.96975
第3次运行: *************************************
gmean_ros: 0.887304, mauc_ros: 0.979583
gmean_rus: 0.812661, mauc_rus: 0.954444
gmean_nos: 0.840024, mauc_nos: 0.970611
gmean_smote: 0.852164, mauc_smote: 0.966889
第4次运行: *************************************
gmean_ros: 0.867847, mauc_ros: 0.976917
gmean_rus: 0.79747, mauc_rus: 0.958611
gmean_nos: 0.840024, mauc_nos: 0.970611
gmean_smote: 0.845506, mauc_smote: 0.957278
第5次运行: *************************************
gmean_ros: 0.730938, mauc_ros: 0.945083
gmean_rus: 0.712276, mauc_rus: 0.949694
gmean_nos: 0.840024, mauc_nos: 0.970611
gmean_