In [1]:
import numpy as np
import pandas as pd
import imblearn
import sklearn

# Report the name and version of each library this notebook relies on.
for lib in (np, pd, imblearn, sklearn):
    print(lib.__name__, lib.__version__)

numpy 1.18.5
pandas 0.25.1
imblearn 0.5.0
sklearn 0.21.3


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# imblearn

Oversampling / Undersampling을 위한 툴 킷


**Resampler**

 sklearn의 transformer의 process는 fit -> transform 구조를 가지고 있는 반면, 
 
 imblearn의 resampler는 fit -> resample 구조를 가지고 있습니다.
 
 X_resample, y_resample = imblearn_obj.fit_resample(df_input, df_target)
 
**주요 매개 변수**

 sampling_strategy: 
 
 > float일 경우, 이진 분류에만 유효하고, OverSampling일 때는 $\frac{\text{소수 클래스의 샘플링 후 개수}}{\text{다수 클래스의 개수}}$, UnderSampling일 때는 $\frac{\text{소수 클래스의 개수}}{\text{다수 클래스의 샘플링 후 개수}}$를 의미합니다.
 
 > dict일 경우, {Class: 빈도수} 의 구조. 리샘플링 후의 각 클래스의 샘플 수
 
 > 'minority': 가장 빈도가 적은 클래스만 리샘플링하여 빈도수를 조정한다. (OverSampling 경우만 유효)
 
 > 'majority': 가장 빈도가 많은 클래스만 리샘플링하여 빈도수 조정. (Undersampling 경우만 유효)
    
 > 'not minority': 가장 빈도가 적은 클래스를 제외하고 샘플링(OverSampling일 때는 가장 빈도수가 높은 클래스의 수로, UnderSampling일 때는 가장 빈도수가 낮은 클래스의 수로 샘플을 다시 뽑는다.)
 
 > 'not majority': 가장 빈도가 많은 클래스를 제외하고 샘플링(OverSampling일 때는 가장 빈도수가 높은 클래스의 수로, UnderSampling일 때는 가장 빈도수가 낮은 클래스의 수로 샘플을 다시 뽑는다.)
 
 > 'auto': 자동 - OverSampling일 때는 'not majority', UnderSampling일 때는 'not minority'
 
**시험장 환경의 imblearn의 특징**

 최신 버젼 imblearn은 입력을 pandas.DataFrame으로 하면 출력도 동일하게 DataFrame으로 해주지만
 
 시험장 버전은 입력 데이터 형식에 상관없이 numpy.ndarray로 반환합니다.
    
Ex)

```python
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_resample, y_resample = smote.fit_resample(df_train[X], df_train[y])
```


# 샘플링(Sampling) 해보기

## 이진 분류(Binary Classification)

In [3]:
from sklearn.datasets import make_classification
# Synthetic, heavily imbalanced binary problem: 5000 samples, two informative
# features, and requested class weights of 1% / 99%.
binary_data_params = dict(
    n_samples=5000, n_features=2, n_informative=2,
    n_redundant=0, n_repeated=0, n_classes=2,
    n_clusters_per_class=1,
    weights=[0.01, 0.99],
    class_sep=0.8, random_state=0,
)
X, y = make_classification(**binary_data_params)
# Class frequencies of the generated target.
pd.Series(y).value_counts()

1    4923
0      77
dtype: int64

In [4]:
from imblearn.over_sampling import RandomOverSampler
# Strategies to compare: float ratios plus the named presets.
sampling_strategies = [0.5, 0.75, 1.0, 'minority', 'not minority', 'not majority', 'all', 'auto']

# Start from the untouched class counts, then add one column of class counts per
# strategy; only the resampled target (second return value) is needed.
val_cnts = {'original': pd.Series(y).value_counts()}
val_cnts.update({
    strategy: pd.Series(
        RandomOverSampler(sampling_strategy=strategy).fit_resample(X, y)[1]
    ).value_counts()
    for strategy in sampling_strategies
})
pd.DataFrame(val_cnts)

Unnamed: 0,original,0.5,0.75,1.0,minority,not minority,not majority,all,auto
1,4923,4923,4923,4923,4923,4923,4923,4923,4923
0,77,2461,3692,4923,4923,77,4923,4923,4923


In [5]:
from imblearn.under_sampling import RandomUnderSampler
# Strategies to compare: float ratios plus the named presets.
sampling_strategies = [0.5, 0.75, 1.0, 'majority', 'not minority', 'not majority', 'all', 'auto']

# Start from the untouched class counts, then add one column of class counts per
# strategy; only the resampled target (second return value) is needed.
val_cnts = {'original': pd.Series(y).value_counts()}
val_cnts.update({
    strategy: pd.Series(
        RandomUnderSampler(sampling_strategy=strategy).fit_resample(X, y)[1]
    ).value_counts()
    for strategy in sampling_strategies
})
pd.DataFrame(val_cnts)

Unnamed: 0,original,0.5,0.75,1.0,majority,not minority,not majority,all,auto
1,4923,154,102,77,77,77,4923,77,77
0,77,77,77,77,77,77,77,77,77


## 다중 분류(Multi-Class Classification)

In [6]:
from sklearn.datasets import make_classification
# Synthetic imbalanced 3-class problem with the same geometry as the binary case.
# NOTE(review): only two weights are given for three classes, so the weight of
# class 2 is left for make_classification to infer; its handful of samples
# presumably comes from the default label noise (flip_y) — verify against the
# scikit-learn documentation.
multiclass_data_params = dict(
    n_samples=5000, n_features=2, n_informative=2,
    n_redundant=0, n_repeated=0, n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.99],
    class_sep=0.8, random_state=0,
)
X, y = make_classification(**multiclass_data_params)
# Class frequencies of the generated target.
pd.Series(y).value_counts()

1    4916
0      64
2      20
dtype: int64

In [7]:
# Strategies to compare: an explicit per-class target dict plus the named presets.
sampling_strategies = [{0: 100, 2: 50}, 'minority', 'not minority', 'not majority', 'all', 'auto']

# Column labels are the string form of each strategy (dicts cannot be used as keys).
val_cnts = {'original': pd.Series(y).value_counts()}
val_cnts.update({
    str(strategy): pd.Series(
        RandomOverSampler(sampling_strategy=strategy).fit_resample(X, y)[1]
    ).value_counts()
    for strategy in sampling_strategies
})
pd.DataFrame(val_cnts)

Unnamed: 0,original,"{0: 100, 2: 50}",minority,not minority,not majority,all,auto
0,64,100,64,4916,4916,4916,4916
1,4916,4916,4916,4916,4916,4916,4916
2,20,50,4916,20,4916,4916,4916


In [8]:
# Strategies to compare: an explicit per-class target dict plus the named presets.
sampling_strategies = [{1: 50, 0: 30}, 'majority', 'not minority', 'not majority', 'all', 'auto']

# Column labels are the string form of each strategy (dicts cannot be used as keys).
val_cnts = {'original': pd.Series(y).value_counts()}
val_cnts.update({
    str(strategy): pd.Series(
        RandomUnderSampler(sampling_strategy=strategy).fit_resample(X, y)[1]
    ).value_counts()
    for strategy in sampling_strategies
})
pd.DataFrame(val_cnts)

Unnamed: 0,original,"{1: 50, 0: 30}",majority,not minority,not majority,all,auto
0,64,30,64,20,20,20,20
1,4916,50,20,20,4916,20,20
2,20,20,20,20,20,20,20


# 실제 예제

## 이진 분류 (Binary Classification) : Stroke

In [9]:
# Load the stroke dataset, using the `id` column as the row index.
stroke_csv = 'data/healthcare-dataset-stroke-data.csv'
df_stroke = pd.read_csv(stroke_csv, index_col='id')
# Preview the first rows.
df_stroke.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


뇌졸중(stroke) 데이터셋입니다

stroke : 대상 변수(0, 1)

지표: 
$f_1score=\frac{2 \cdot Precision \cdot Recall}{Precision + Recall}$ . positive label은 1 

In [10]:
# Absolute and relative class frequencies of the target.
stroke_target = df_stroke['stroke']
stroke_target.value_counts(), stroke_target.value_counts(normalize=True)

(0    4861
 1     249
 Name: stroke, dtype: int64,
 0    0.951272
 1    0.048728
 Name: stroke, dtype: float64)

stroke가 1인 경우가 5%인 target class에 치우침이 있는 데이터 셋입니다.

In [11]:
# Number of missing values in each column.
df_stroke.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

BMI에 201건의 NA가 존재합니다.

In [12]:
# Names of the columns with fewer than 10 distinct observed values.
n_distinct = df_stroke.nunique()
cat_cols = n_distinct[n_distinct < 10].index.tolist()
cat_cols

['gender',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status',
 'stroke']

In [13]:
# Frequency of every observed value in the low-cardinality columns, reshaped
# into one long (column, value) -> count table.
value_freqs = df_stroke[cat_cols].apply(pd.Series.value_counts)
value_freqs.unstack().dropna().to_frame()

Unnamed: 0,Unnamed: 1,0
gender,Female,2994.0
gender,Male,2115.0
gender,Other,1.0
hypertension,0,4612.0
hypertension,1,498.0
heart_disease,0,4834.0
heart_disease,1,276.0
ever_married,No,1757.0
ever_married,Yes,3353.0
work_type,Govt_job,657.0


In [14]:
# 'Other' occurs only once in gender, so it is merged into 'Male'.
# NOTE(review): the value counts above show 'Female' (2994) is the most frequent
# gender, not 'Male' (2115); the replacement value is left as 'Male' so that the
# downstream results stay unchanged.
df_stroke['gender'] = df_stroke['gender'].replace({'Other': 'Male'})
# Record which rows originally lacked a bmi value. Guarded so that re-running this
# cell after imputation does not overwrite the flag with all-False.
if 'isna_bmi' not in df_stroke.columns:
    df_stroke['isna_bmi'] = df_stroke['bmi'].isna()
# Impute missing bmi with the mean of the observed values.
# NOTE(review): the mean is computed over the full dataset, i.e. before the
# cross-validation split — consider imputing inside each fold.
df_stroke['bmi'] = df_stroke['bmi'].fillna(df_stroke['bmi'].mean())

In [15]:
# Verify the result of the missing-value handling: per-column NA counts.
df_stroke.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
isna_bmi             0
dtype: int64

In [16]:
# Verify the result of the category clean-up: value frequencies per
# low-cardinality column, as one long (column, value) -> count table.
value_freqs = df_stroke[cat_cols].apply(pd.Series.value_counts)
value_freqs.unstack().dropna().to_frame()

Unnamed: 0,Unnamed: 1,0
gender,Female,2994.0
gender,Male,2116.0
hypertension,0,4612.0
hypertension,1,498.0
heart_disease,0,4834.0
heart_disease,1,276.0
ever_married,No,1757.0
ever_married,Yes,3353.0
work_type,Govt_job,657.0
work_type,Never_worked,22.0


In [17]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

In [18]:
# Candidate features: every column except the target.
X_stroke = [i for i in df_stroke.columns if i != 'stroke']

# One-hot encode the categorical columns and pass the numeric ones through.
# NOTE(review): 'hypertension' and 'heart_disease' are part of X_stroke but are
# listed in neither transformer, so with ColumnTransformer's default
# remainder='drop' they never reach the model — confirm this is intended.
ct_stroke_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(), ['gender', 'ever_married', 'work_type', 'smoking_status', 'Residence_type', 'isna_bmi']),
    ('pt', 'passthrough', ['age', 'avg_glucose_level', 'bmi']), # passed through untouched
])
clf_stroke_rf = make_pipeline(ct_stroke_rf, 
                              RandomForestClassifier(max_depth=11, n_estimators=100, random_state=123))

# Baseline: cross-validate on the data as-is (no resampling), collecting the
# F1 score (positive label 1) of each fold.
scores_ = []
for train_idx, test_idx in cv.split(df_stroke[X_stroke], df_stroke['stroke']):
    df_cv_train, df_cv_test = df_stroke.iloc[train_idx].copy(), df_stroke.iloc[test_idx].copy()
    clf_stroke_rf.fit(df_cv_train[X_stroke], df_cv_train['stroke'])
    scores_.append(f1_score(df_cv_test['stroke'], clf_stroke_rf.predict(df_cv_test[X_stroke])))
# Per-fold scores and their mean.
scores_, np.mean(scores_)

([0.06896551724137932,
  0.07407407407407407,
  0.0,
  0.10714285714285712,
  0.07843137254901959],
 0.06572276420146603)

In [19]:
# Same baseline via cross_val_score, using the built-in 'f1' scorer on the same folds.
baseline_f1_per_fold = cross_val_score(
    clf_stroke_rf, df_stroke[X_stroke], df_stroke['stroke'], scoring='f1', cv=cv
)
baseline_f1_per_fold

array([0.06896552, 0.07407407, 0.        , 0.10714286, 0.07843137])

In [20]:
# Candidate features: every column except the target.
X_stroke = [i for i in df_stroke.columns if i != 'stroke']
# handle_unknown='ignore' matches the encoder used in the under-sampling experiment,
# so both experiments share one preprocessing definition and a category missing
# from a training fold cannot abort the run.
# NOTE(review): 'hypertension' and 'heart_disease' are in X_stroke but in neither
# transformer, so the default remainder='drop' discards them — confirm intended.
ct_stroke_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'),
                    ['gender', 'ever_married', 'work_type', 'smoking_status', 'isna_bmi']), # 'Residence_type' is excluded
    ('pt', 'passthrough', ['age', 'avg_glucose_level', 'bmi']), # passed through untouched
])
# NOTE(review): the feature set and min_samples_leaf differ from the baseline
# pipeline, so score differences are not due to resampling alone.
clf_stroke_rf = make_pipeline(
    ct_stroke_rf, 
    RandomForestClassifier(max_depth=11, min_samples_leaf=4, n_estimators=100, random_state=123)
)

# Mean 5-fold F1 for each over-sampling ratio. Only the training fold is
# resampled; the test fold keeps its original class distribution.
for sr in [0.2, 0.5, 0.75, 1.0]:
    scores_ = []
    for train_idx, test_idx in cv.split(df_stroke[X_stroke], df_stroke['stroke']):
        df_cv_train, df_cv_test = df_stroke.iloc[train_idx].copy(), df_stroke.iloc[test_idx].copy()   
        X_sam, y_sam = RandomOverSampler(sampling_strategy=sr, random_state=123)\
                                    .fit_resample(df_cv_train[X_stroke], df_cv_train['stroke'])
        # The exam-environment imblearn does not return a DataFrame, and the
        # ColumnTransformer selects columns by name, so rebuild a DataFrame
        # with the feature names.
        df_cv_train = pd.DataFrame(X_sam, columns=X_stroke)
        clf_stroke_rf.fit(df_cv_train, y_sam)
        scores_.append(f1_score(df_cv_test['stroke'], clf_stroke_rf.predict(df_cv_test[X_stroke])))
    print('sampling_strategy:{}, f1_score: {}'.format(sr, np.mean(scores_)))

sampling_strategy:0.2, f1_score: 0.16447836424362244
sampling_strategy:0.5, f1_score: 0.24080566418391686
sampling_strategy:0.75, f1_score: 0.25272092492412723
sampling_strategy:1.0, f1_score: 0.2608453420982487


In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

# Candidate features: every column except the target.
X_stroke = [col for col in df_stroke.columns if col != 'stroke']

# One-hot encode the categorical columns ('Residence_type' is left out), ignoring
# categories unseen during fit; numeric columns pass through untouched.
onehot_cols = ['gender', 'ever_married', 'work_type', 'smoking_status', 'isna_bmi']
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
ct_stroke_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
    ('pt', 'passthrough', numeric_cols),
])
forest = RandomForestClassifier(max_depth=11, min_samples_leaf=4, n_estimators=100, random_state=123)
clf_stroke_rf = make_pipeline(ct_stroke_rf, forest)

# Mean 5-fold F1 for each under-sampling ratio. Only the training fold is
# resampled; the test fold keeps its original class distribution.
for sr in [0.2, 0.5, 0.75, 1.0]:
    scores_ = []
    for train_idx, test_idx in cv.split(df_stroke[X_stroke], df_stroke['stroke']):
        df_cv_train = df_stroke.iloc[train_idx].copy()
        df_cv_test = df_stroke.iloc[test_idx].copy()
        sampler = RandomUnderSampler(sampling_strategy=sr, random_state=123)
        X_sam, y_sam = sampler.fit_resample(df_cv_train[X_stroke], df_cv_train['stroke'])
        # The exam-environment imblearn does not return a DataFrame, and the
        # ColumnTransformer selects columns by name, so rebuild a DataFrame
        # with the feature names.
        df_cv_train = pd.DataFrame(X_sam, columns=X_stroke)
        clf_stroke_rf.fit(df_cv_train, y_sam)
        y_pred = clf_stroke_rf.predict(df_cv_test[X_stroke])
        scores_.append(f1_score(df_cv_test['stroke'], y_pred))
    print('sampling_strategy:{}, f1_score: {}'.format(sr, np.mean(scores_)))

sampling_strategy:0.2, f1_score: 0.25152876217045933
sampling_strategy:0.5, f1_score: 0.2846393966784234
sampling_strategy:0.75, f1_score: 0.2447974276644954
sampling_strategy:1.0, f1_score: 0.22366196099192157


## 다중 분류(Multi-class Classification) : fetal health dataset

fetal health dataset

fetal_health : 대상 변수(1, 2, 3)

지표: Macro f1_score

$f_1score_i=\frac{2 \cdot Precision_i \cdot Recall_i}{Precision_i + Recall_i}$ . positive label을 i번째 범주로 했을 때 f1_score

Macro f1_score = $\frac{1}{N}\sum^{N}_{i=1}f_1score_i$

**다중 분류에서 Macro, Micro recall, precision, f1_score**


Macro는 분류 각각에 대한 score를 구한 다음 평균을 구합니다. 

Micro는 분류에 대한 소계를 내지 않고, 모든 입력에 대해 TP, FP, FN를 구하여 해당 지표를 냅니다.

* sklearn.metrics f1_score, precision_score, recall_score 사용시 공통 적용 사항

labels 는 positive_label 에 적용할 class의 label 값들 

Ex) 대상 클래스가 (1, 2, 3)일 때, labels=\[2, 3\]으로 하면, 

Macro는 \[2, 3\]에 대한 값만을 평균으로 

Micro는 \[2, 3\]에 대한 TP, FP, FN을 합산하여 지표를 계산합니다.

average: 집계 방법 

  'macro',  'micro', 'binary'

```python
from sklearn.metrics import f1_score
# set(y) = {1, 2, 3}
f1_score(y, clf.predict(X), average='macro') # macro f1_score를 계산한다.
f1_score(y, clf.predict(X), labels=[1, 2], average='macro') # y가 1, 2에 대한 macro f1_score를 계산한다. 
```

In [22]:
# Load the fetal health dataset.
fetal_csv = 'data/fetal_health.csv'
df_fetal = pd.read_csv(fetal_csv)
# Preview the first rows.
df_fetal.head()

Unnamed: 0,baseline_value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,62,126,2,0,120,137,121,73,1,2
1,132,0.006,0.0,0.006,0.003,0.0,0.0,17,2.1,0,...,68,198,6,1,141,136,140,12,0,1
2,133,0.003,0.0,0.008,0.003,0.0,0.0,16,2.1,0,...,68,198,5,1,141,135,138,13,0,1
3,134,0.003,0.0,0.008,0.003,0.0,0.0,16,2.4,0,...,53,170,11,0,137,134,137,13,1,1
4,132,0.007,0.0,0.008,0.0,0.0,0.0,16,2.4,0,...,53,170,9,0,137,136,138,11,1,1


In [23]:
# Absolute and relative class frequencies of the target.
fetal_target = df_fetal['fetal_health']
fetal_target.value_counts(), fetal_target.value_counts(normalize=True)

(1    1655
 2     295
 3     176
 Name: fetal_health, dtype: int64,
 1    0.778457
 2    0.138758
 3    0.082785
 Name: fetal_health, dtype: float64)

In [24]:
# Number of missing values in each column.
df_fetal.isnull().sum()

baseline_value                                            0
accelerations                                             0
fetal_movement                                            0
uterine_contractions                                      0
light_decelerations                                       0
severe_decelerations                                      0
prolongued_decelerations                                  0
abnormal_short_term_variability                           0
mean_value_of_short_term_variability                      0
percentage_of_time_with_abnormal_long_term_variability    0
mean_value_of_long_term_variability                       0
histogram_width                                           0
histogram_min                                             0
histogram_max                                             0
histogram_number_of_peaks                                 0
histogram_number_of_zeroes                                0
histogram_mode                          

In [25]:
# Names of the columns with fewer than 10 distinct observed values.
n_distinct = df_fetal.nunique()
cat_cols = n_distinct[n_distinct < 10].index.tolist()

In [26]:
# Frequency of every observed value in the low-cardinality columns, reshaped
# into one long (column, value) -> count table.
value_freqs = df_fetal[cat_cols].apply(pd.Series.value_counts)
value_freqs.unstack().dropna().to_frame()

Unnamed: 0,Unnamed: 1,0
severe_decelerations,0.0,2119.0
severe_decelerations,0.001,7.0
prolongued_decelerations,0.0,1948.0
prolongued_decelerations,0.001,70.0
prolongued_decelerations,0.002,72.0
prolongued_decelerations,0.003,24.0
prolongued_decelerations,0.004,9.0
prolongued_decelerations,0.005,3.0
histogram_number_of_zeroes,0.0,1624.0
histogram_number_of_zeroes,1.0,366.0


In [27]:
# From the frequency table above, the author concludes that histogram_tendency
# is the only categorical feature (fetal_health, the target, is excluded).
cat_cols = ['histogram_tendency']

In [28]:
# Continuous features: every column that is neither categorical nor the target.
non_continuous = set(cat_cols) | {'fetal_health'}
cont_cols = [col for col in df_fetal.columns if col not in non_continuous]
cont_cols

['baseline_value',
 'accelerations',
 'fetal_movement',
 'uterine_contractions',
 'light_decelerations',
 'severe_decelerations',
 'prolongued_decelerations',
 'abnormal_short_term_variability',
 'mean_value_of_short_term_variability',
 'percentage_of_time_with_abnormal_long_term_variability',
 'mean_value_of_long_term_variability',
 'histogram_width',
 'histogram_min',
 'histogram_max',
 'histogram_number_of_peaks',
 'histogram_number_of_zeroes',
 'histogram_mode',
 'histogram_mean',
 'histogram_median',
 'histogram_variance']

In [29]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
X_fetal = cat_cols + cont_cols

# One-hot encode the categorical column(s); pass the continuous ones through.
ct_fetal_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(categories='auto'), cat_cols),
    ('pt', 'passthrough', cont_cols), # passed through untouched
])
clf_fetal_rf = make_pipeline(ct_fetal_rf, 
                              RandomForestClassifier(max_depth=9, n_estimators=100, random_state=123))
scores_ = []   # macro F1 over all classes, one entry per fold
scores2_ = []  # macro F1 restricted to the minority classes (2, 3), one entry per fold
# Baseline: cross-validate on the data as-is (no resampling).
for train_idx, test_idx in cv.split(df_fetal[X_fetal], df_fetal['fetal_health']):
    df_cv_train, df_cv_test = df_fetal.iloc[train_idx].copy(), df_fetal.iloc[test_idx].copy()
    clf_fetal_rf.fit(df_cv_train[X_fetal], df_cv_train['fetal_health'])
    # Predict once per fold and reuse the predictions for both metrics.
    y_pred = clf_fetal_rf.predict(df_cv_test[X_fetal])
    scores_.append(f1_score(df_cv_test['fetal_health'], y_pred, average='macro'))
    scores2_.append(f1_score(df_cv_test['fetal_health'], y_pred, labels=[2, 3], average='macro'))
# Per-fold scores and their means for both metrics.
scores_, np.mean(scores_), scores2_, np.mean(scores2_)

([0.8966003316749585,
  0.8771241830065359,
  0.8583193856872401,
  0.884641287701731,
  0.8774154437894562],
 0.8788201263719844,
 [0.8583333333333334,
  0.8333333333333334,
  0.8074199943358822,
  0.8417987268048219,
  0.8340336134453782],
 0.8349838002505499)

In [30]:
# The built-in scorer name 'f1_macro' selects the macro-averaged F1 score.
# Other built-in names follow the same pattern, e.g. 'f1_micro' (micro F1)
# and 'recall_macro' (macro recall).
macro_f1_per_fold = cross_val_score(
    clf_fetal_rf, df_fetal[X_fetal], df_fetal['fetal_health'],
    scoring='f1_macro', cv=cv,
)
macro_f1_per_fold

array([0.89660033, 0.87712418, 0.85831939, 0.88464129, 0.87741544])

In [31]:
from sklearn.metrics import make_scorer 
# make_scorer builds a scorer for metrics that have no built-in scoring name;
# here: macro F1 restricted to labels 2 and 3.
minority_macro_f1 = make_scorer(f1_score, labels=[2, 3], average='macro')
cross_val_score(
    clf_fetal_rf, df_fetal[X_fetal], df_fetal['fetal_health'],
    cv=cv, scoring=minority_macro_f1,
)

array([0.85833333, 0.83333333, 0.80741999, 0.84179873, 0.83403361])

In [32]:
X_fetal = cat_cols + cont_cols

# One-hot encode the categorical column(s); pass the continuous ones through.
ct_fetal_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(categories='auto'), cat_cols),
    ('pt', 'passthrough', cont_cols),
])
forest = RandomForestClassifier(max_depth=9, n_estimators=100, random_state=123)
clf_fetal_rf = make_pipeline(ct_fetal_rf, forest)

# Over-sampling targets to compare: explicit per-class counts and a named preset.
sampling_strategies = [
    {2: 500, 3: 500}, {2: 700, 3: 700}, {2: 1000, 3:1000}, 'not majority'
]

# Mean 5-fold macro F1 per strategy. Only the training fold is resampled.
for i in sampling_strategies:
    scores_ = []
    for train_idx, test_idx in cv.split(df_fetal[X_fetal], df_fetal['fetal_health']):
        df_cv_train = df_fetal.iloc[train_idx].copy()
        df_cv_test = df_fetal.iloc[test_idx].copy()
        sampler = RandomOverSampler(sampling_strategy=i, random_state=123)
        X_sam, y_sam = sampler.fit_resample(df_cv_train[X_fetal], df_cv_train['fetal_health'])
        # The ColumnTransformer selects columns by name, so rebuild a DataFrame
        # with the feature names from the resampled data.
        df_cv_train = pd.DataFrame(X_sam, columns=X_fetal)
        clf_fetal_rf.fit(df_cv_train[X_fetal], y_sam)
        y_pred = clf_fetal_rf.predict(df_cv_test[X_fetal])
        scores_.append(f1_score(df_cv_test['fetal_health'], y_pred, average='macro'))
    print('sampling_strategy:{}, {}'.format(i, np.mean(scores_)))

sampling_strategy:{2: 500, 3: 500}, 0.8795168809919034
sampling_strategy:{2: 700, 3: 700}, 0.8869376337944944
sampling_strategy:{2: 1000, 3: 1000}, 0.8831369613386529
sampling_strategy:not majority, 0.8779914103102495


In [33]:
# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
X_fetal = cat_cols + cont_cols

# One-hot encode the categorical column(s); pass the continuous ones through.
ct_fetal_rf = ColumnTransformer([
    ('ohe', OneHotEncoder(categories='auto'), cat_cols),
    ('pt', 'passthrough', cont_cols),
])
forest = RandomForestClassifier(max_depth=9, n_estimators=100, random_state=123)
clf_fetal_rf = make_pipeline(ct_fetal_rf, forest)

# Under-sampling targets to compare: explicit per-class counts and a named preset.
sampling_strategies = [
    {1: 500, 2: 200}, {1: 300, 2: 180}, 'not minority'
]

# Mean 5-fold macro F1 per strategy. Only the training fold is resampled.
for i in sampling_strategies:
    scores_ = []
    for train_idx, test_idx in cv.split(df_fetal[X_fetal], df_fetal['fetal_health']):
        df_cv_train = df_fetal.iloc[train_idx].copy()
        df_cv_test = df_fetal.iloc[test_idx].copy()
        sampler = RandomUnderSampler(sampling_strategy=i, random_state=123)
        X_sam, y_sam = sampler.fit_resample(df_cv_train[X_fetal], df_cv_train['fetal_health'])
        # The exam-environment imblearn does not return a DataFrame, and the
        # ColumnTransformer selects columns by name, so rebuild a DataFrame
        # with the feature names.
        df_cv_train = pd.DataFrame(X_sam, columns=X_fetal)
        clf_fetal_rf.fit(df_cv_train[X_fetal], y_sam)
        y_pred = clf_fetal_rf.predict(df_cv_test[X_fetal])
        scores_.append(f1_score(df_cv_test['fetal_health'], y_pred, average='macro'))
    print('sampling_strategy:{}, {}'.format(i, np.mean(scores_)))

sampling_strategy:{1: 500, 2: 200}, 0.8861614679590858
sampling_strategy:{1: 300, 2: 180}, 0.8667422821605804
sampling_strategy:not minority, 0.8365591126772973
