### 构造小样本数据集

In [5]:
import pandas as pd
import numpy as np

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split

seed = 42

data = datasets.load_breast_cancer()
df = pd.DataFrame(np.c_[data['data'], data['target']],
                  columns= np.append(data['feature_names'], ['target']))

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df[data['feature_names']], df['target'], test_size=0.3, random_state=seed)

In [7]:
x_few_train, _, y_few_train, _ = train_test_split(x_train, y_train, train_size=50, random_state=seed)

### 获取baseline

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, \
    recall_score, average_precision_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


import xgboost as xgb
def tabular_model_test(x_train, y_train, x_test, y_test,
                       model_name='logistic_regression',
                       sample_weight=None, model_params=None):
    """适用于表格结构化数据二分类任务"""
    # 训练模型
    # sample_weight = sample_weight if sample_weight is not None else None
    if model_name == 'xgb':
        xgb_params = {
            'objective': 'binary:logistic',
            'seed': seed
        }
        if model_params is not None:
            xgb_params.update(model_params)
        model = xgb.XGBClassifier(**xgb_params)
        model.fit(x_train, y_train, eval_metric=['auc'], sample_weight=sample_weight)
    elif model_name == 'logistic_regression':
        model = LogisticRegression(random_state=seed)
        model.fit(x_train, y_train, sample_weight=sample_weight)
    elif model_name == 'rf':
        model = RandomForestClassifier(random_state=seed)
        model.fit(x_train, y_train)
    elif model_name == 'decision_tree':
        model = DecisionTreeClassifier(random_state=seed)
        model.fit(x_train, y_train, sample_weight=sample_weight)
    elif model_name == 'adaboost':
        model = AdaBoostClassifier(random_state=seed)
        model.fit(x_train, y_train, sample_weight=sample_weight)
    elif model_name == "knn":
        model = KNeighborsClassifier()
        model.fit(x_train, y_train)
    elif model_name == 'mlp':
        model = MLPClassifier(random_state=seed, early_stopping=True, hidden_layer_sizes=128)
        model.fit(x_train, y_train)
    elif model_name == 'svm':
        model = svm.SVC(gamma='scale', C=1.0, decision_function_shape='ovr', kernel='rbf')
        model.fit(x_train, y_train)
    else:
        # default model
        model = LogisticRegression()
        model.fit(x_train, y_train, sample_weight=sample_weight)

    # 预测
    test_pred = model.predict(x_test)

    # 将概率转换为类别
    test_pred_label = [1 if p >= 0.5 else 0 for p in test_pred]

    # 计算准确率和AUC
    test_accuracy = accuracy_score(y_test, test_pred_label)
    test_precision = precision_score(y_test, test_pred_label)
    test_f1 = f1_score(y_test, test_pred_label)
    test_auc = roc_auc_score(y_test, test_pred)
    test_recall = recall_score(y_test, test_pred_label)
    test_ap = average_precision_score(y_test, test_pred_label)
    # 计算混淆矩阵
    test_confusion_matrix = confusion_matrix(y_test, test_pred_label)

    # print(f'Train Accuracy: {train_accuracy}')
    print(f'Test Accuracy: {test_accuracy}')
    print(f'Test precision: {test_precision}')
    print(f'Test F1: {test_f1}')
    print(f'Test AUC: {test_auc}')
    print(f'Test AP: {test_ap}')
    # print(train_confusion_matrix)
    print('Test Confusion Matrix:')
    print(test_confusion_matrix)
    # return test_accuracy, test_precision, test_f1, test_auc, test_ap



#### 全量训练数据的训练情况

In [9]:

tabular_model_test(x_train, y_train, x_test, y_test, model_name='xgb')

Test Accuracy: 0.9824561403508771
Test precision: 0.981651376146789
Test F1: 0.9861751152073732
Test AUC: 0.9794973544973544
Test AP: 0.9784099647692115
Test Confusion Matrix:
[[ 61   2]
 [  1 107]]


In [10]:
# 小样本训练情况
tabular_model_test(x_few_train, y_few_train, x_test, y_test, model_name='xgb')

Test Accuracy: 0.9298245614035088
Test precision: 0.9210526315789473
Test F1: 0.9459459459459458
Test AUC: 0.9146825396825398
Test AP: 0.9130116959064327
Test Confusion Matrix:
[[ 54   9]
 [  3 105]]


### SMOTE增强使用示例

In [11]:
from smote import smote_augmentation
x_synthesis, y_synthesis = smote_augmentation(x_few_train, y_few_train, 'SVMSMOTE', seed=seed,
                                              oversample_num=100, positive_ratio=None,
                                              knn_neighbors=3)
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')


Test Accuracy: 0.9649122807017544
Test precision: 0.9553571428571429
Test F1: 0.9727272727272727
Test AUC: 0.9556878306878308
Test AP: 0.9523591966026177
Test Confusion Matrix:
[[ 58   5]
 [  1 107]]


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### Mixup增强使用示例

In [12]:
from mixup import mixup_augmentation_with_weight
method = 'vanilla'
x_synthesis, y_synthesis, sample_weight = mixup_augmentation_with_weight(
            x_few_train, y_few_train, oversample_num=200, alpha=1, beta=1, mixup_type=method, seed=seed, rebalanced_ita=1)
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb', sample_weight=sample_weight)

do vanilla mixup....
positive: 160.0 negative: 128.0
Test Accuracy: 0.9532163742690059
Test precision: 0.9464285714285714
Test F1: 0.9636363636363636
Test AUC: 0.943121693121693
Test AP: 0.9405980228348649
Test Confusion Matrix:
[[ 57   6]
 [  2 106]]


In [13]:
from mixup import mixup_augmentation_with_weight
method = 'noisy'
x_synthesis, y_synthesis, sample_weight = mixup_augmentation_with_weight(
            x_few_train, y_few_train, oversample_num=200, alpha=1, beta=1, mixup_type=method, seed=seed, rebalanced_ita=1)
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb', sample_weight=sample_weight)

do noisy mixup....
positive: 160.0 negative: 128.0
Test Accuracy: 0.9532163742690059
Test precision: 0.9310344827586207
Test F1: 0.9642857142857143
Test AUC: 0.9365079365079365
Test AP: 0.9310344827586207
Test Confusion Matrix:
[[ 55   8]
 [  0 108]]


In [14]:
from mixup import mixup_augmentation_with_weight
method = 'rebalanced'
x_synthesis, y_synthesis, sample_weight = mixup_augmentation_with_weight(
            x_few_train, y_few_train, oversample_num=200, alpha=1, beta=1, mixup_type=method, seed=seed, rebalanced_ita=0.7)
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb', sample_weight=sample_weight)

do rebalanced mixup....
positive: 160.0 negative: 128.0
Test Accuracy: 0.9532163742690059
Test precision: 0.9464285714285714
Test F1: 0.9636363636363636
Test AUC: 0.943121693121693
Test AP: 0.9405980228348649
Test Confusion Matrix:
[[ 57   6]
 [  2 106]]


### CTGAN/TVAE增强

#### CTGAN增强

In [15]:
from sdv_synthesizer import sdv_synthesis, sdv_synthesis_one_gan, sdv_synthesis_cvae
method = 'CTGAN'

x_synthesis, y_synthesis = sdv_synthesis(
            x_few_train, y_few_train, method, oversample_num=5000,
            seed=seed, init_synthesizer=True, positive_ratio=0.5,
        )
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')


positive generator init, CTGAN
negative generator init, CTGAN
Test Accuracy: 0.9181286549707602
Test precision: 0.8852459016393442
Test F1: 0.9391304347826086
Test AUC: 0.8888888888888888
Test AP: 0.8852459016393442
Test Confusion Matrix:
[[ 49  14]
 [  0 108]]


#### TVAE增强

In [16]:
from sdv_synthesizer import sdv_synthesis, sdv_synthesis_cvae
method = 'TVAE'

x_synthesis, y_synthesis = sdv_synthesis(
            x_few_train, y_few_train, method, oversample_num=5000,
            seed=seed, init_synthesizer=True, positive_ratio=0.5,
        )
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')


positive generator init, TVAE
negative generator init, TVAE
Test Accuracy: 0.9649122807017544
Test precision: 0.9811320754716981
Test F1: 0.9719626168224299
Test AUC: 0.9656084656084655
Test AP: 0.9681856633197249
Test Confusion Matrix:
[[ 61   2]
 [  4 104]]


In [17]:
from sdv_synthesizer import sdv_synthesis, sdv_synthesis_cvae
method = 'ConditionalTVAE'

x_synthesis, y_synthesis = sdv_synthesis_cvae(
            x_few_train, y_few_train, method, oversample_num=10000,
            seed=seed, init_synthesizer=True, positive_ratio=0.5,
        )
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')


positive generator init, ConditionalTVAE
Test Accuracy: 0.9590643274853801
Test precision: 0.9809523809523809
Test F1: 0.9671361502347416
Test AUC: 0.9609788359788359
Test AP: 0.9647776849531234
Test Confusion Matrix:
[[ 61   2]
 [  5 103]]


In [18]:
from sdv_synthesizer import sdv_synthesis, sdv_synthesis_cvae
method = 'DeltaTVAE'

x_synthesis, y_synthesis = sdv_synthesis(
            x_few_train, y_few_train, method, oversample_num=10000,
            seed=seed, init_synthesizer=True, positive_ratio=0.5,
        )
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')

positive generator init, DeltaTVAE
negative generator init, DeltaTVAE
Test Accuracy: 0.9473684210526315
Test precision: 0.9459459459459459
Test F1: 0.9589041095890412
Test AUC: 0.9384920634920635
Test AP: 0.9372135293187925
Test Confusion Matrix:
[[ 57   6]
 [  3 105]]


In [19]:
from sdv_synthesizer import sdv_synthesis, sdv_synthesis_cvae
method = 'DiffTVAE'

x_synthesis, y_synthesis = sdv_synthesis(
            x_few_train, y_few_train, method, oversample_num=2000,
            seed=seed, init_synthesizer=True, positive_ratio=0.5,
        )
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')

positive generator init, DiffTVAE
negative generator init, DiffTVAE
Test Accuracy: 0.9649122807017544
Test precision: 0.9722222222222222
Test F1: 0.9722222222222222
Test AUC: 0.9623015873015873
Test AP: 0.9627599090318388
Test Confusion Matrix:
[[ 60   3]
 [  3 105]]


#### TabDDPM增强

In [None]:
from tab_ddpm.synthesis import ddpm_synthesis

method = "DDPM"

x_synthesis, y_synthesis = ddpm_synthesis(
            x_few_train, y_few_train, method, oversample_num=5000, seed=seed, init_synthesizer=True, positive_ratio=None, train_steps=10000)
tabular_model_test(x_synthesis, y_synthesis, x_test, y_test, model_name='xgb')


mlp
Step 500/10000 MLoss: 0.0 GLoss: 0.217 Sum: 0.217
Step 1000/10000 MLoss: 0.0 GLoss: 0.179 Sum: 0.179
Step 1500/10000 MLoss: 0.0 GLoss: 0.1657 Sum: 0.1657
Step 2000/10000 MLoss: 0.0 GLoss: 0.1604 Sum: 0.1604
Step 2500/10000 MLoss: 0.0 GLoss: 0.1497 Sum: 0.1497
Step 3000/10000 MLoss: 0.0 GLoss: 0.143 Sum: 0.143
Step 3500/10000 MLoss: 0.0 GLoss: 0.1387 Sum: 0.1387
Step 4000/10000 MLoss: 0.0 GLoss: 0.1264 Sum: 0.1264
Step 4500/10000 MLoss: 0.0 GLoss: 0.1237 Sum: 0.1237
Step 5000/10000 MLoss: 0.0 GLoss: 0.1098 Sum: 0.1098
Step 5500/10000 MLoss: 0.0 GLoss: 0.1104 Sum: 0.1104
Step 6000/10000 MLoss: 0.0 GLoss: 0.1065 Sum: 0.1065
Step 6500/10000 MLoss: 0.0 GLoss: 0.099 Sum: 0.099
Step 7000/10000 MLoss: 0.0 GLoss: 0.0943 Sum: 0.0943
Step 7500/10000 MLoss: 0.0 GLoss: 0.093 Sum: 0.093
