In [1]:
import os, random
import keras
from keras.datasets import fashion_mnist
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.utils import np_utils
import numpy as np
import tensorflow as tf
def _expanduser_to_cwd(path):
    """Stand-in for os.path.expanduser: ignores `path` and always returns './'."""
    return './'


# Process-wide monkeypatch: every later call to os.path.expanduser returns './'.
# NOTE(review): presumably this is meant to make Keras cache downloaded datasets
# in the working directory instead of the user's home directory -- confirm.
os.path.expanduser = _expanduser_to_cwd

In [2]:
# Training hyperparameters shared by every experiment in this notebook.
batch_size = 128
num_classes = 10
epochs = 60

# The data, split between train and test sets.
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Flatten every image into a single feature vector, cast to float32 and divide
# by 255. The sample count comes from the arrays themselves and the feature
# count is inferred (-1), so the cell no longer hard-codes 60000/10000/784.
# NOTE(review): dividing by 255 presumably maps 8-bit pixels to [0, 1] -- confirm.
x_train = x_train.reshape(len(x_train), -1).astype('float32') / 255
x_test = x_test.reshape(len(x_test), -1).astype('float32') / 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert integer class vectors to one-hot (binary class) matrices.
# `keras.utils.to_categorical` is the public entry point for this helper; the
# `keras.utils.np_utils` path is an internal module location.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

60000 train samples
10000 test samples


# 1. Define Model(Dropout rate 0.2)

In [3]:
# Reproducibility setup: seed every random number generator used below from a
# single constant so the value cannot drift between the three calls.
# (`os` and `random` are already imported in the first cell.)
SEED = 123

# NOTE: PYTHONHASHSEED is read when the Python interpreter starts, so assigning
# it here does not change hash randomization of the already-running kernel; it
# is only inherited by child processes. Export it before launching Jupyter if
# deterministic hashing is actually required.
os.environ['PYTHONHASHSEED'] = '0'
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Single-threaded, single-CPU session configuration.
session_config = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
    allow_soft_placement=True,
    device_count={'CPU': 1},
)
sess = tf.compat.v1.Session(config=session_config)

# Register the session through the public compat alias instead of importing the
# private `tensorflow.python.keras` package, which is not the `keras` package
# the models in this notebook are built with.
# NOTE(review): under TF2 eager execution model.fit presumably does not run
# through this v1 session -- confirm whether the thread limits take effect.
K = tf.compat.v1.keras.backend
K.set_session(sess)



# Experiment 1: two 512-unit hidden layers, each followed by Dropout(0.2).
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.2

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               401920    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 10)                5130      
                                                                 
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________


# Start Training(Dropout rate 0.2)

In [4]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.2)

In [5]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8761000037193298
Accuracy: 87.61%



# 2. Define Model(Dropout rate 0.5)

In [6]:
# Experiment 2: same network as experiment 1, but with Dropout(0.5).
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.5

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 512)               401920    
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_4 (Dense)             (None, 512)               262656    
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 10)                5130      
                                                                 
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________


# Start Training(Dropout rate 0.5)

In [7]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.5)

In [8]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8705000281333923
Accuracy: 87.05%



# 3. Define Model(Dropout rate 0.8)

In [9]:
# Experiment 3: same network as experiment 1, but with Dropout(0.8).
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.8

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               401920    
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_7 (Dense)             (None, 512)               262656    
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_8 (Dense)             (None, 10)                5130      
                                                                 
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________


# Start Training(Dropout rate 0.8)

In [10]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.8)

In [11]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8513000011444092
Accuracy: 85.13%



# Dropout rate의 변화와 그에 따른 Accuracy 비교

위의 세 경우에서 Dropout rate가 각각 0.2, 0.5, 0.8일 때 test accuracy는 87.61%, 87.05%, 85.13%로 점점 감소한다. 이렇게 변하는 이유는 무엇일까? 기본적으로 Dropout은 DNN에서 각 layer의 노드들이 각자 맡고 있는 입력을 더 잘 분석하도록 하기 위해 사용하며, 뉴런이 다른 뉴런에 의존하는 co-adaptation 문제를 완화하고 overfitting 문제를 개선하는 효과를 가지고 있다. 하지만 dropout은 학습 시 임의로 꺼 버리는 노드의 비율에 따라 성능이 크게 달라질 수 있는데, 위 세 가지 케이스의 정확도를 보면 이 점이 잘 드러난다. 
Dropout rate 0.2를 적용한 모델의 학습 결과를 보면 validation loss가 epoch을 거듭할수록 감소하며, 다시 증가하지는 않는다. 이는 overfitting 문제가 크게 드러나지 않았다는 뜻이고, 다시 말하면 매 학습 단계에서 은닉층 노드를 평균적으로 10개 중 2개만 0으로 설정하여 학습시켰는데도 이미 원하는 모델에 가까워졌음을 뜻한다. 반면 더 많은 은닉층 노드를 꺼 버린 0.5와 0.8의 경우에는, 물론 정확도가 급격하게 떨어질 정도는 아니지만, epoch을 60회나 수행했음에도 학습 과정에서 너무 많은 노드를 무시한 결과 충분한 학습이 되지 못해 정확도가 떨어지는 양상을 보인다. 즉 dropout rate는 무조건 높인다고 좋은 것이 아니라 적절한 비율을 찾아 적용해야 올바른 학습이 진행되고, 이런 결과는 batch normalization을 함께 적용한 아래 케이스들에서도 똑같은 양상으로 나타난다. 

# 4. Define Model(Dropout rate 0.2 + Batch Normalization)

In [12]:
# Experiment 4: Dropout(0.2) followed by BatchNormalization after each hidden layer.
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.2

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        BatchNormalization(),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        BatchNormalization(),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 512)               401920    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dense_10 (Dense)            (None, 512)               262656    
                                                                 
 dropout_7 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization_1 (Batc  (None, 512)              2048      
 hNormalization)                                      

# Start Training(Dropout rate 0.2 + Batch Normalization)

In [13]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.2 + Batch Normalization)

In [14]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8899999856948853
Accuracy: 89.00%



# 5. Define Model(Dropout rate 0.5 + Batch Normalization)

In [15]:
# Experiment 5: Dropout(0.5) followed by BatchNormalization after each hidden layer.
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.5

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        BatchNormalization(),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        BatchNormalization(),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 512)               401920    
                                                                 
 dropout_8 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization_2 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_13 (Dense)            (None, 512)               262656    
                                                                 
 dropout_9 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization_3 (Batc  (None, 512)              2048      
 hNormalization)                                      

# Start Training(Dropout rate 0.5 + Batch Normalization)

In [16]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.5 + Batch Normalization)

In [17]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8830000162124634
Accuracy: 88.30%



# 6. Define Model(Dropout rate 0.8 + Batch Normalization)

In [18]:
# Experiment 6: Dropout(0.8) followed by BatchNormalization after each hidden layer.
# The settings below were previously declared but ignored by the layers; they
# are now passed in, so changing one of them actually changes the model.
# ('glorot_uniform' is the Dense default, so the architecture is unchanged.)
kernel_initializer = 'glorot_uniform'
activation_function = 'relu'
dropout_rate = 0.8

with tf.device('/cpu:0'):
    model = Sequential([
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer, input_shape=(784,)),
        Dropout(dropout_rate),
        BatchNormalization(),
        Dense(512, activation=activation_function,
              kernel_initializer=kernel_initializer),
        Dropout(dropout_rate),
        BatchNormalization(),
        # Output layer: one softmax probability per class.
        Dense(num_classes, activation='softmax',
              kernel_initializer=kernel_initializer),
    ])

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 512)               401920    
                                                                 
 dropout_10 (Dropout)        (None, 512)               0         
                                                                 
 batch_normalization_4 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_16 (Dense)            (None, 512)               262656    
                                                                 
 dropout_11 (Dropout)        (None, 512)               0         
                                                                 
 batch_normalization_5 (Batc  (None, 512)              2048      
 hNormalization)                                      

# Start Training(Dropout rate 0.8 + Batch Normalization)

In [19]:
# Train the current model; validation_split=0.2 reserves 20% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60


Epoch 58/60
Epoch 59/60
Epoch 60/60


# Calculate accuracy (Dropout rate 0.8 + Batch Normalization)

In [20]:
# Score the trained model on the test set; the result holds loss and accuracy.
metrics = model.evaluate(x=x_test, y=y_test)
test_accuracy = metrics[1]  # index 0 is the loss, index 1 the accuracy
print(test_accuracy)
print(f'Accuracy: {metrics[1]*100:.2f}%\n')

0.8496000170707703
Accuracy: 84.96%



# Batch Normalization을 Dropout과 함께 적용한 모델 vs 
# Dropout만 진행했을 때의 정확도 비교


위에서 확인할 수 있듯이 dropout rate가 0.8일 때는 정확도가 비슷했지만(85.13% vs 84.96%), dropout rate가 0.2와 0.5일 때는 batch normalization을 함께 적용한 쪽의 정확도가 확연히 높았다(87.61% → 89.00%, 87.05% → 88.30%). 1~3번 케이스에서 살펴본 바에 따르면 0.2가 적절한 dropout rate라고 할 수 있으므로, 이 경우를 기준으로 보면 결국 Batch Normalization이 DNN 학습에 긍정적인 영향을 주었다고 볼 수 있을 것이다. (사실 batch normalization은 그 자체로 dropout의 필요성을 감소시키는 효과도 있다.)
그 이유는 무엇인가? Batch Normalization은, 앞에서 vanishing gradient 같은 문제를 해결하기 위해 activation 함수를 조절하는 간접적인 방법을 사용했던 것에 더해, 학습 과정 자체에서 각 layer로 들어가는 입력을 mini-batch 단위로 적절히 정규화하여 학습을 원활하게 만드는 방법이다. 학습 과정에서 dimension별로 정규화하기 때문에 학습 속도가 빨라지고 초기값에 의존하는 정도가 줄어든다. 무엇보다 batch normalization을 적용하면 gradient flow가 개선되어 학습 성능이 향상되며, 위의 정확도 차이는 그 결과가 나타난 것이라고 보면 된다.