# 不同的采样方式

In [9]:
from instance_selection.operator.metrics import calculate_gmean_mauc
from sklearn.neural_network import MLPClassifier
from utils.dataset_utils import get_distribution
from instance_selection.parameter.parameter import *  # 导入参数的设定
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy.io as sio  # 从.mat文件中读取数据集
import random
import warnings

warnings.filterwarnings("ignore")  # 忽略警告

DATASET = Dermatology  # 数据集名称（包含对应参数的字典形式）
datasetname = DATASET['DATASETNAME'].split('.')[0]

# 加载、划分数据集
mat_data = sio.loadmat(IMBALANCED_DATASET_PATH + DATASET['DATASETNAME'])
x = mat_data['X']
y = mat_data['Y'][:, 0]  # mat_data['Y']得到的形状为[n,1]，通过[:,0]，得到形状[n,]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)  # 划分数据集
scaler = StandardScaler()  # 数据的标准化
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
model = MLPClassifier(hidden_layer_sizes=(DATASET['HIDDEN_SIZE'],), max_iter=DATASET['MAX_ITER'],
                      random_state=RANDOM_SEED, learning_rate_init=DATASET['LEARNING_RATE'])

model.fit(x_train, y_train)
y_test_pred_proba = model.predict_proba(x_test)
gmean, mauc, recall_per_class = calculate_gmean_mauc(y_test_pred_proba, y_test)
print(f'{datasetname} Gmean: {gmean}, mAUC: {mauc}')

Dermatology Gmean: 0.905031, mAUC: 0.97987


## ROS 随机过采样

In [15]:
from sklearn.base import clone
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x, y)
x_train_ros, x_test_ros, y_train_ros, y_test_ros = train_test_split(x_ros, y_ros, test_size=0.2,
                                                                    random_state=RANDOM_SEED)  # 划分数据集
scaler = StandardScaler()  # 数据的标准化
x_train_ros = scaler.fit_transform(x_train_ros)
x_test_ros = scaler.transform(x_test_ros)
unique_elements_all, classes_all, counts_all = get_distribution(y)  # 获取原始数据集分布
unique_elements_ros, classes_ros, counts_ros = get_distribution(y_ros)  # 获取原始数据集分布
unique_elements_train, classes_train, counts_train = get_distribution(y_train_ros)  # 获取训练集分布
unique_elements_test, classes_test, counts_test = get_distribution(y_test_ros)  # 获取测试集分布

print(datasetname + 'distribution:')
print(f'source dataset: {counts_all}')
print(f'ros dataset: {counts_ros}')
print(f'trainset: {counts_train}')
print(f'testset: {counts_test}')

model_ros = clone(model)
model_ros.fit(x_train_ros, y_train_ros)
y_test_pred_proba_ros = model_ros.predict_proba(x_test_ros)
gmean_ros, mauc_ros, recall_per_class_ros = calculate_gmean_mauc(y_test_pred_proba_ros, y_test_ros)
print(f'{datasetname} Gmean: {gmean_ros}, mAUC: {mauc_ros}')

Dermatologydistribution:
source dataset: [112  61  72  49  52  20]
ros dataset: [112 112 112 112 112 112]
trainset: [91 92 90 85 92 87]
testset: [21 20 22 27 20 25]
Dermatology Gmean: 0.951932, mAUC: 0.992985
