# 불균형 데이터셋 문제 해결 : SMOTE

In [1]:
import tensorflow as tf
import numpy as np
import random
from imblearn.over_sampling import BorderlineSMOTE

## 데이터셋 준비 (불균형한 데이터셋)

In [2]:
# Load CIFAR-10 (32x32 RGB images) through the Keras dataset helper.
cifar10 = tf.keras.datasets.cifar10   # 32x32x3

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Cast to float32 *before* scaling the pixels to [0, 1].  Dividing the raw
# integer arrays by 255.0 first would materialise a float64 copy of the
# whole dataset (twice the memory) only to downcast it right afterwards.
# NOTE(review): load_data() is documented to return uint8 pixels - confirm
# for the installed Keras version.
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0

x_train.shape   # 10 classes, 5,000 training images each

(50000, 32, 32, 3)

In [3]:
# Turn the training data into a small, imbalanced two-class dataset:
# keep every class-1 sample but only about 10% of the class-0 samples.
# Boolean masks replace the original per-sample Python loop.

# Flatten the labels so the masks are 1-D whether labels arrive as (N,)
# or as a column vector (N, 1).
train_labels = np.ravel(y_train)
is_class0 = train_labels == 0
is_class1 = train_labels == 1

# The previous `random.randint(0, 100) < 10` draws from 101 values (both
# ends inclusive), i.e. a keep rate of 10/101.  A uniform draw in [0, 1)
# compared against 0.1 gives the intended 10% keep probability.
keep_class0 = is_class0 & (np.random.random(train_labels.shape[0]) < 0.1)
train_mask = keep_class0 | is_class1

x_train_small = x_train[train_mask]
y_train_small = y_train[train_mask]

# The test split keeps all samples of classes 0 and 1 (no down-sampling).
test_labels = np.ravel(y_test)
test_mask = (test_labels == 0) | (test_labels == 1)

x_test_small = x_test[test_mask]
y_test_small = y_test[test_mask]

# Report the label counts (the first line used to print len(x_train_small)
# under the 'y_train_small' label; the two lengths are always equal).
print('y_train_small', len(y_train_small))
print('y_test_small ', len(y_test_small))

# Boolean indexing already produced fresh arrays, so no stacking is needed.
x_train, y_train = x_train_small, y_train_small
x_test, y_test = x_test_small, y_test_small

# Only the training pipeline is shuffled; both are batched in groups of 32.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(32).prefetch(2048)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32).prefetch(2048)

y_train_small 5451
y_test_small  2000


## 모델 정의

In [4]:
class MyModel(tf.keras.Model):
    """Binary classifier with a single hidden layer.

    The input is flattened, passed through a 1024-unit ReLU layer and
    mapped to one sigmoid output unit.
    """

    def __init__(self):
        """Create the flatten, hidden and output layers."""
        super().__init__()

        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(units=1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def call(self, x, training=False, mask=None):
        """Run the forward pass and return the sigmoid output.

        `training` and `mask` are accepted but not used by this model.
        """
        flat = self.flatten(x)
        hidden = self.dense1(flat)
        return self.dense2(hidden)

## Keras API 모델 학습 (불균형한 데이터셋)

In [5]:
# Number of passes over the training set.
EPOCHS = 30

model = MyModel()

# Track precision and recall next to accuracy so the results obtained on
# the imbalanced data can be compared.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

# Train on the imbalanced set and validate on the test set after each
# epoch; verbose=2 prints one summary line per epoch.
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    verbose=2,
)

Epoch 1/30
171/171 - 1s - loss: 0.3523 - accuracy: 0.9151 - precision: 0.9258 - recall: 0.9864 - val_loss: 1.4628 - val_accuracy: 0.5085 - val_precision: 0.5043 - val_recall: 0.9990
Epoch 2/30
171/171 - 1s - loss: 0.2232 - accuracy: 0.9272 - precision: 0.9366 - recall: 0.9874 - val_loss: 0.9559 - val_accuracy: 0.6215 - val_precision: 0.5703 - val_recall: 0.9860
Epoch 3/30
171/171 - 1s - loss: 0.2117 - accuracy: 0.9288 - precision: 0.9413 - recall: 0.9838 - val_loss: 0.8270 - val_accuracy: 0.6405 - val_precision: 0.5830 - val_recall: 0.9870
Epoch 4/30
171/171 - 1s - loss: 0.1977 - accuracy: 0.9329 - precision: 0.9440 - recall: 0.9852 - val_loss: 0.7119 - val_accuracy: 0.6830 - val_precision: 0.6144 - val_recall: 0.9830
Epoch 5/30
171/171 - 1s - loss: 0.1939 - accuracy: 0.9363 - precision: 0.9489 - recall: 0.9836 - val_loss: 1.0869 - val_accuracy: 0.5655 - val_precision: 0.5353 - val_recall: 0.9940
Epoch 6/30
171/171 - 1s - loss: 0.1836 - accuracy: 0.9378 - precision: 0.9476 - recall: 0.

<tensorflow.python.keras.callbacks.History at 0x299c8001608>

## 데이터셋 준비 (BorderlineSMOTE)

In [6]:
# Apply BorderlineSMOTE.  The sampler is fed 2-D data, so every sample is
# flattened into a single feature vector.  reshape(n, -1) does not
# hard-code the image rank, so the cell still works when the arrays have
# already been flattened (the explicit shape[1] * shape[2] * shape[3]
# product raised IndexError on a second run).
x_train = x_train.reshape((x_train.shape[0], -1)).astype(np.float32)
x_test = x_test.reshape((x_test.shape[0], -1)).astype(np.float32)

smote = BorderlineSMOTE()
# Pass the labels as an explicit 1-D vector instead of relying on the
# sampler's input validation to convert a column vector.
# NOTE(review): with (N, 1) labels scikit-learn-style validation is
# expected to emit a DataConversionWarning - confirm for the installed
# imbalanced-learn version.
x_train, y_train = smote.fit_resample(x_train, np.ravel(y_train))

# Rebuild both pipelines from the resampled training set and the flattened
# test set; only the training pipeline is shuffled.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(32).prefetch(2048)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32).prefetch(2048)

## Keras API 모델 학습 (BorderlineSMOTE)

In [8]:
# Start from a freshly initialised model so the run on the resampled data
# does not reuse weights from an earlier fit.
model = MyModel()

# Same optimizer, loss and metrics as before: accuracy plus precision and
# recall.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

# Train on the resampled set and validate on the test set after each epoch;
# verbose=2 prints one summary line per epoch.
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    verbose=2,
)

Epoch 1/30
313/313 - 2s - loss: 0.6238 - accuracy: 0.7175 - precision: 0.7352 - recall: 0.6798 - val_loss: 0.5185 - val_accuracy: 0.7535 - val_precision: 0.6994 - val_recall: 0.8890
Epoch 2/30
313/313 - 2s - loss: 0.4171 - accuracy: 0.8214 - precision: 0.8626 - recall: 0.7646 - val_loss: 0.4390 - val_accuracy: 0.8055 - val_precision: 0.7858 - val_recall: 0.8400
Epoch 3/30
313/313 - 2s - loss: 0.3334 - accuracy: 0.8670 - precision: 0.9111 - recall: 0.8134 - val_loss: 0.4163 - val_accuracy: 0.8240 - val_precision: 0.8600 - val_recall: 0.7740
Epoch 4/30
313/313 - 2s - loss: 0.2813 - accuracy: 0.8964 - precision: 0.9346 - recall: 0.8524 - val_loss: 0.4274 - val_accuracy: 0.8220 - val_precision: 0.8634 - val_recall: 0.7650
Epoch 5/30
313/313 - 2s - loss: 0.2285 - accuracy: 0.9212 - precision: 0.9586 - recall: 0.8804 - val_loss: 0.4459 - val_accuracy: 0.8260 - val_precision: 0.7835 - val_recall: 0.9010
Epoch 6/30
313/313 - 2s - loss: 0.2036 - accuracy: 0.9286 - precision: 0.9591 - recall: 0.

<tensorflow.python.keras.callbacks.History at 0x299c2d7c188>