In [1]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve,confusion_matrix, auc,roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from datetime import datetime, date, time
from functools import reduce
import catboost
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('mode.chained_assignment',  None)
pd.set_option('display.max_columns',500)


In [2]:
merge_df = pd.read_csv('merge_df.csv')
merge_df = merge_df.fillna(merge_df.mean())

In [3]:
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)
  
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

0 번째
Youden index threshold :  0.15515048109536442
roc_auc_score :  0.7375
acc :  0.7241379310344828
sensitivity :  0.7
specificity :  0.7291666666666666
pr_auc :  0.5022289007482466 

[[35 13]
 [ 3  7]]
1 번째
Youden index threshold :  0.35891395645632435
roc_auc_score :  0.70625
acc :  0.6379310344827587
sensitivity :  0.9
specificity :  0.5833333333333334
pr_auc :  0.27451150260023394 

[[28 20]
 [ 1  9]]
2 번째
Youden index threshold :  0.446136226117855
roc_auc_score :  0.75
acc :  0.8103448275862069
sensitivity :  0.7
specificity :  0.8333333333333334
pr_auc :  0.46173495754891103 

[[40  8]
 [ 3  7]]
3 번째
Youden index threshold :  0.058079911145375106
roc_auc_score :  0.7120181405895691
acc :  0.7931034482758621
sensitivity :  0.6666666666666666
specificity :  0.8163265306122449
pr_auc :  0.452692363548418 

[[40  9]
 [ 3  6]]
4 번째
Youden index threshold :  0.18903306555914764
roc_auc_score :  0.6157407407407408
acc :  0.631578947368421
sensitivity :  0.6666666666666666
specificity 

In [4]:
print('basic : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

basic : 
proba_score 0.6556776738473167
re_score 0.68
specifi_score 0.6914328231292518


[0.7375,
 0.70625,
 0.75,
 0.7120181405895691,
 0.6157407407407408,
 0.7729166666666667,
 0.55625,
 0.5979166666666667,
 0.5204081632653061,
 0.7361111111111112,
 0.58125,
 0.675,
 0.7145833333333335,
 0.5056689342403629,
 0.7199074074074074,
 0.6145833333333334,
 0.6104166666666666,
 0.6979166666666666,
 0.6780045351473922,
 0.6111111111111112]

# SMOTE

In [5]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    #SMOTE로 타겟의 분포를 균등하게 조절
    smote = SMOTE(random_state=42)
    x_train_sk, y_train_sk = smote.fit_resample(x_train_sk, y_train_sk)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train_sk).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.0344497638597603
roc_auc_score :  0.6520833333333332
acc :  0.5517241379310345
sensitivity :  0.9
specificity :  0.4791666666666667
pr_auc :  0.2532928534844663 

[[23 25]
 [ 1  9]]
 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.41662571828853057
roc_auc_score :  0.8500000000000001
acc :  0.7241379310344828
sensitivity :  1.0
specificity :  0.6666666666666666
pr_auc :  0.5765419480050318 

[[32 16]
 [ 0 10]]
 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.463690008753108
roc_auc_score :  0.5770833333333334
acc :  0.46551724137931033
sensitivity :  0.8
specificity :  0.3958333333333333
pr_auc :  0.19354203607689974 

[[19 29]
 [ 2  8]]
 변화된 값의 분포 :
 0    153
1    153
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.5126542105292249
roc_auc_score :  0.528344671201814
acc :  0.8793103448275862
sensit

In [6]:
print('SMOTE 5 4 : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

SMOTE 5 4 : 
proba_score 0.6671921060090703
re_score 0.6894444444444445
specifi_score 0.7111181972789116


[0.6520833333333332,
 0.8500000000000001,
 0.5770833333333334,
 0.528344671201814,
 0.5717592592592592,
 0.7041666666666666,
 0.66875,
 0.6666666666666666,
 0.7165532879818595,
 0.6365740740740741,
 0.7333333333333333,
 0.5916666666666667,
 0.6604166666666667,
 0.5419501133786848,
 0.7615740740740741,
 0.6354166666666666,
 0.6010416666666667,
 0.6479166666666667,
 0.7142857142857143,
 0.8842592592592593]

# BorderlineSMOTE

In [7]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    bsmote = BorderlineSMOTE(random_state=42)
    x_train_sk, y_train_sk = bsmote.fit_resample(x_train_sk, y_train_sk)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train_sk).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.4850520247407377
roc_auc_score :  0.603125
acc :  0.5172413793103449
sensitivity :  0.8
specificity :  0.4583333333333333
pr_auc :  0.21898405929637133 

[[22 26]
 [ 2  8]]
 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.18540590859145598
roc_auc_score :  0.7791666666666667
acc :  0.6551724137931034
sensitivity :  1.0
specificity :  0.5833333333333334
pr_auc :  0.3936551205566632 

[[28 20]
 [ 0 10]]
 변화된 값의 분포 :
 0    154
1    154
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.08014971433882917
roc_auc_score :  0.7000000000000001
acc :  0.7586206896551724
sensitivity :  0.7
specificity :  0.7708333333333334
pr_auc :  0.39678994473275153 

[[37 11]
 [ 3  7]]
 변화된 값의 분포 :
 0    153
1    153
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.47016728563411925
roc_auc_score :  0.5374149659863946
acc :  0.7586206896551724
sensitivity 

In [8]:
print('BorderlineSMOTE-1 5 4 : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

BorderlineSMOTE-1 5 4 : 
proba_score 0.6649939767573695
re_score 0.6783333333333335
specifi_score 0.7031887755102042


[0.603125,
 0.7791666666666667,
 0.7000000000000001,
 0.5374149659863946,
 0.6342592592592593,
 0.5708333333333333,
 0.7041666666666666,
 0.65625,
 0.5873015873015872,
 0.8078703703703703,
 0.7104166666666667,
 0.5625,
 0.6437500000000002,
 0.5986394557823129,
 0.7268518518518519,
 0.6666666666666666,
 0.6583333333333333,
 0.575,
 0.7324263038548753,
 0.8449074074074073]

# ADASYN

In [4]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    adasyn = ADASYN(random_state = 42)
    x_train_sk, y_train_sk = adasyn.fit_resample(x_train_sk, y_train_sk)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train_sk).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 0    154
1    148
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.12478781419090107
roc_auc_score :  0.6625
acc :  0.5344827586206896
sensitivity :  0.9
specificity :  0.4583333333333333
pr_auc :  0.31694120157639605 

[[22 26]
 [ 1  9]]
 변화된 값의 분포 :
 1    157
0    154
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.18186855310118483
roc_auc_score :  0.8604166666666666
acc :  0.7586206896551724
sensitivity :  0.9
specificity :  0.7291666666666666
pr_auc :  0.42936156572185985 

[[35 13]
 [ 1  9]]
 변화된 값의 분포 :
 1    162
0    154
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.23479581402723596
roc_auc_score :  0.6583333333333333
acc :  0.41379310344827586
sensitivity :  1.0
specificity :  0.2916666666666667
pr_auc :  0.34743587279287547 

[[14 34]
 [ 0 10]]
 변화된 값의 분포 :
 1    166
0    153
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.3233529594824198
roc_auc_score :  0.6575963718820862
acc :  0.7068965517241379
sensitivity 

In [5]:
print('adasyn : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

adasyn : 
proba_score 0.6698430413832199
re_score 0.7094444444444445
specifi_score 0.6692602040816327


[0.6625,
 0.8604166666666666,
 0.6583333333333333,
 0.6575963718820862,
 0.6574074074074073,
 0.6583333333333334,
 0.7145833333333333,
 0.6291666666666667,
 0.41043083900226757,
 0.6736111111111112,
 0.7229166666666667,
 0.5854166666666667,
 0.6552083333333333,
 0.5986394557823129,
 0.6203703703703703,
 0.6708333333333333,
 0.7541666666666667,
 0.6520833333333333,
 0.7006802721088435,
 0.8541666666666666]

# SMOTETomek

In [11]:
from imblearn.combine import SMOTETomek 
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    s_tomek = SMOTETomek(random_state = 42)
    x_train, y_train = s_tomek.fit_resample(x_train, y_train)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train_sk).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 0    154
1     30
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.15515048109536442
roc_auc_score :  0.7375
acc :  0.7241379310344828
sensitivity :  0.7
specificity :  0.7291666666666666
pr_auc :  0.5022289007482466 

[[35 13]
 [ 3  7]]
 변화된 값의 분포 :
 0    154
1     30
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.35891395645632435
roc_auc_score :  0.70625
acc :  0.6379310344827587
sensitivity :  0.9
specificity :  0.5833333333333334
pr_auc :  0.27451150260023394 

[[28 20]
 [ 1  9]]
 변화된 값의 분포 :
 0    154
1     30
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.446136226117855
roc_auc_score :  0.75
acc :  0.8103448275862069
sensitivity :  0.7
specificity :  0.8333333333333334
pr_auc :  0.46173495754891103 

[[40  8]
 [ 3  7]]
 변화된 값의 분포 :
 0    153
1     31
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.058079911145375106
roc_auc_score :  0.7120181405895691
acc :  0.7931034482758621
sensitivity :  0.6666666666666666
speci

In [12]:
print('SMOTETomek 5 4 : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

SMOTETomek 5 4 : 
proba_score 0.6556776738473167
re_score 0.68
specifi_score 0.6914328231292518


[0.7375,
 0.70625,
 0.75,
 0.7120181405895691,
 0.6157407407407408,
 0.7729166666666667,
 0.55625,
 0.5979166666666667,
 0.5204081632653061,
 0.7361111111111112,
 0.58125,
 0.675,
 0.7145833333333335,
 0.5056689342403629,
 0.7199074074074074,
 0.6145833333333334,
 0.6104166666666666,
 0.6979166666666666,
 0.6780045351473922,
 0.6111111111111112]

# SMOTEENN

In [13]:
from imblearn.combine import SMOTEENN 
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    s_enn = SMOTEENN(random_state=42)
    x_train, y_train = s_enn.fit_resample(x_train, y_train)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 1    190
0     37
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.15515048109536442
roc_auc_score :  0.7375
acc :  0.7241379310344828
sensitivity :  0.7
specificity :  0.7291666666666666
pr_auc :  0.5022289007482466 

[[35 13]
 [ 3  7]]
 변화된 값의 분포 :
 1    185
0     36
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.35891395645632435
roc_auc_score :  0.70625
acc :  0.6379310344827587
sensitivity :  0.9
specificity :  0.5833333333333334
pr_auc :  0.27451150260023394 

[[28 20]
 [ 1  9]]
 변화된 값의 분포 :
 1    186
0     53
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.446136226117855
roc_auc_score :  0.75
acc :  0.8103448275862069
sensitivity :  0.7
specificity :  0.8333333333333334
pr_auc :  0.46173495754891103 

[[40  8]
 [ 3  7]]
 변화된 값의 분포 :
 1    184
0     42
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.058079911145375106
roc_auc_score :  0.7120181405895691
acc :  0.7931034482758621
sensitivity :  0.6666666666666666
speci

In [14]:
print('SMOTEENN 5 4 : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

SMOTEENN 5 4 : 
proba_score 0.6556776738473167
re_score 0.68
specifi_score 0.6914328231292518


[0.7375,
 0.70625,
 0.75,
 0.7120181405895691,
 0.6157407407407408,
 0.7729166666666667,
 0.55625,
 0.5979166666666667,
 0.5204081632653061,
 0.7361111111111112,
 0.58125,
 0.675,
 0.7145833333333335,
 0.5056689342403629,
 0.7199074074074074,
 0.6145833333333334,
 0.6104166666666666,
 0.6979166666666666,
 0.6780045351473922,
 0.6111111111111112]

# RandomOverSampler

In [15]:
from imblearn.over_sampling import RandomOverSampler
seed = 7
skfold = RepeatedStratifiedKFold(n_splits=5,random_state=seed,n_repeats=4)

proba_score = []
predict_score = []
acc_score = []
re_score = []
prauc_score = []
specifi_score = []
cm1_score = []

curve_fprs = []
curve_tprs = []
curve_thresholds = []

curve_pre = []
curve_rec = []
curve_thr = []

tprs_ls = []
mean_fpr = np.linspace(0,1,100)

x_df = merge_df.drop(['Key','cutoff'],axis=1)
y_df = merge_df['cutoff']

sc = StandardScaler()
x_df = pd.DataFrame(sc.fit_transform(x_df), columns=x_df.columns)

for j, (train_idx, val_idx) in enumerate(skfold.split(x_df,y_df)):

    x_train, y_train = x_df.iloc[train_idx], y_df.iloc[train_idx]
    x_test, y_test = x_df.iloc[val_idx], y_df.iloc[val_idx]

    x_train_sk, x_val_sk, y_train_sk, y_val_sk = train_test_split(x_train,y_train,stratify=y_train, random_state=seed, test_size=0.2)

    oversample = RandomOverSampler(sampling_strategy='minority')
    x_train, y_train = oversample.fit_resample(x_train, y_train)
    print(' 변화된 값의 분포 :\n',pd.Series(y_train).value_counts() )
    
    
    model = catboost.CatBoostClassifier(
                                    use_best_model=True,
                                    eval_metric='AUC',
                                    verbose=False,
                                    random_state=seed
                                    )
    D_train = catboost.Pool(x_train_sk, y_train_sk)
    D_test = catboost.Pool(x_val_sk, y_val_sk)

    model.fit(D_train, eval_set=D_test, verbose=False)

    pred = model.predict_proba(x_test)[:,1]
    fprs, tprs, thresholds = roc_curve(y_test, pred)

    tprs_ls.append(np.interp(mean_fpr, fprs, tprs))

    curve_tprs.append(tprs)
    curve_fprs.append(fprs)
    curve_thresholds.append(thresholds)

    J = tprs - fprs
    ix = np.argmax(J)
    best_thresh = thresholds[ix]

    precision, recall, th = precision_recall_curve(y_test, pred)
    prauc = auc(recall, precision)

    curve_pre.append(precision)
    curve_rec.append(recall)
    curve_thr.append(th)
        
    cm1 = confusion_matrix(y_test, np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0))

    Specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    Sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(j,'번째')
    print('Youden index threshold : ',best_thresh)

    print('roc_auc_score : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    print('acc : ',accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    print('sensitivity : ',Sensitivity)
    print('specificity : ',Specificity)
    print('pr_auc : ',prauc,'\n')
    print(cm1)

    proba_score.append(roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))
    acc_score.append(accuracy_score(y_test,np.where(model.predict_proba(x_test)[:,1] >= best_thresh , 1, 0)))
    re_score.append(Sensitivity)
    specifi_score.append(Specificity)
    prauc_score.append(prauc)
    cm1_score.append(cm1)

 변화된 값의 분포 :
 1    193
0    193
Name: cutoff, dtype: int64
0 번째
Youden index threshold :  0.15515048109536442
roc_auc_score :  0.7375
acc :  0.7241379310344828
sensitivity :  0.7
specificity :  0.7291666666666666
pr_auc :  0.5022289007482466 

[[35 13]
 [ 3  7]]
 변화된 값의 분포 :
 0    193
1    193
Name: cutoff, dtype: int64
1 번째
Youden index threshold :  0.35891395645632435
roc_auc_score :  0.70625
acc :  0.6379310344827587
sensitivity :  0.9
specificity :  0.5833333333333334
pr_auc :  0.27451150260023394 

[[28 20]
 [ 1  9]]
 변화된 값의 분포 :
 1    193
0    193
Name: cutoff, dtype: int64
2 번째
Youden index threshold :  0.446136226117855
roc_auc_score :  0.75
acc :  0.8103448275862069
sensitivity :  0.7
specificity :  0.8333333333333334
pr_auc :  0.46173495754891103 

[[40  8]
 [ 3  7]]
 변화된 값의 분포 :
 1    192
0    192
Name: cutoff, dtype: int64
3 번째
Youden index threshold :  0.058079911145375106
roc_auc_score :  0.7120181405895691
acc :  0.7931034482758621
sensitivity :  0.6666666666666666
speci

In [16]:
print('RandomOverSampler 5 4 : ')
print('proba_score',np.mean(proba_score))
print('re_score',np.mean(re_score))
print('specifi_score',np.mean(specifi_score))
proba_score

RandomOverSampler 5 4 : 
proba_score 0.6556776738473167
re_score 0.68
specifi_score 0.6914328231292518


[0.7375,
 0.70625,
 0.75,
 0.7120181405895691,
 0.6157407407407408,
 0.7729166666666667,
 0.55625,
 0.5979166666666667,
 0.5204081632653061,
 0.7361111111111112,
 0.58125,
 0.675,
 0.7145833333333335,
 0.5056689342403629,
 0.7199074074074074,
 0.6145833333333334,
 0.6104166666666666,
 0.6979166666666666,
 0.6780045351473922,
 0.6111111111111112]