In [1]:
import tensorflow as tf
import numpy as np
from IPython.display import Image

# 다층퍼셉트론 구조
텐서플로우로 아래의 다층퍼셉트론을 구현해보도록 하겠습니다.

In [2]:
# Show the diagram of the multilayer perceptron (with dropout) built in this notebook.
Image(
    url="https://raw.githubusercontent.com/minsuk-heo/deeplearning/master/img/dropout.png",
    width=500,
    height=250,
)

# MNIST 데이터 불러오기

In [3]:
# Load MNIST through Keras: one (images, labels) pair for training and one for testing.
(x_train, y_train), (x_test, y_test) = (
    tf.keras.datasets.mnist.load_data()
)

In [4]:
# Shapes of the raw train and test image arrays, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(60000, 28, 28)
(10000, 28, 28)


# 학습데이터에서 검증 데이터 분리하기

In [5]:
# Hold out samples 50000..59999 of the training set as validation data and keep
# the first 50000 for training. Both right-hand sides are evaluated before the
# names are rebound, so the validation slice is taken from the full array.
# NOTE: the cell rebinds x_train / y_train, so re-running it without reloading
# the data would slice the already-shortened arrays.
x_train, x_val = x_train[0:50000], x_train[50000:60000]
y_train, y_val = y_train[0:50000], y_train[50000:60000]

In [6]:
# Summarize the training split: sample count, then image height * width.
print("".join(["train data has ", str(x_train.shape[0]), " samples"]))
print("".join(["every train data is ", str(x_train.shape[1]),
               " * ", str(x_train.shape[2]), " image"]))

train data has 50000 samples
every train data is 28 * 28 image


In [7]:
# Summarize the validation split: sample count, then image height * width.
print("validation data has " + str(x_val.shape[0]) + " samples")
# Both dimensions are read from x_val itself, so the line describes the
# validation images rather than mixing in the training array's width.
# NOTE(review): the label still reads "train"; it is left unchanged so the
# recorded cell output stays valid. Consider "validation" on the next re-run.
print("every train data is " + str(x_val.shape[1])
      + " * " + str(x_val.shape[2]) + " image")

validation data has 10000 samples
every train data is 28 * 28 image


**0** 부터 **255** 까지의 그레이 스케일을 확인할 수 있습니다.

In [8]:
# Print row 8 of the first training image to show the raw gray-scale values.
print(x_train[0, 8])

[  0   0   0   0   0   0   0  18 219 253 253 253 253 253 198 182 247 241
   0   0   0   0   0   0   0   0   0   0]


**0** 부터 **9**까지의 이미지에 해당하는 숫자를 확인할 수 있습니다.

In [9]:
# Print the labels of the first nine training samples (indices 0 through 8).
print(y_train[:9])

[5 0 4 1 9 2 1 3 1]


테스트 데이터는 **10000** 개의 샘플을 가지고 있습니다.  
모든 테스트 데이터는 **28 * 28** 의 이미지입니다.  

In [10]:
# Summarize the test split: sample count, then image height * width.
print("".join(["test data has ", str(x_test.shape[0]), " samples"]))
print("".join(["every test data is ", str(x_test.shape[1]),
               " * ", str(x_test.shape[2]), " image"]))

test data has 10000 samples
every test data is 28 * 28 image


# 데이터 구조 변경하기
다층퍼셉트론의 입력 레이어에 데이터를 넣기 위해서 2d tensor (28, 28)인 각 이미지를,  
길이가 28*28 = 784인 1d tensor (784,)의 형태로 바꿔줍니다.  
이 말은 행렬 형태의 데이터를 배열 형태의 데이터로 변경한다는 의미와 같습니다.

In [11]:
# Show the illustration of flattening a 28x28 MNIST image into one row.
Image(
    url="https://raw.githubusercontent.com/minsuk-heo/deeplearning/master/img/reshape_mnist.png",
    width=500,
    height=250,
)

In [12]:
# Flatten every image into a single row vector. The sample count is taken from
# each array and the row length is inferred (-1), so this cell does not repeat
# the split sizes chosen when the validation data was separated.
x_train = x_train.reshape(len(x_train), -1)
x_val = x_val.reshape(len(x_val), -1)
x_test = x_test.reshape(len(x_test), -1)

print(x_train.shape)
print(x_test.shape)

(50000, 784)
(10000, 784)


In [13]:
# Display the first flattened training image (raw pixel values, before normalization).
x_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,  18,  18,
       126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 17

# 데이터 정규화
데이터 정규화는 보통 학습 시간을 단축하고, 더 나은 성능을 구하도록 도와줍니다.  
MNIST 데이터의 모든 값은 0부터 255의 범위 안에 있으므로, 255로 값을 나눠줌으로써, 모든 값을 0부터 1 사이의 값으로 정규화합니다.  

In [14]:
# Convert the pixel arrays to float32 and divide by the maximum gray-scale
# value so every feature lies in [0, 1].
# NOTE: the arrays are rebound under the same names, so running this cell a
# second time would divide the already-scaled values again.
gray_scale = 255
x_train = x_train.astype('float32') / gray_scale
x_val = x_val.astype('float32') / gray_scale
x_test = x_test.astype('float32') / gray_scale

# 실제값을 one hot encoding으로 변경하기
손실 함수에서 크로스 엔트로피를 계산하기 위해, 실제값은 one hot encoding 값으로 변경합니다.

In [15]:
# One-hot encode the integer labels of all three splits (10 digit classes).
# NOTE: the label arrays are rebound under the same names; re-running the cell
# would try to encode the already-encoded arrays.
num_classes = 10
y_train, y_val, y_test = [
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_val, y_test)
]

In [16]:
# Display the one-hot encoded training labels.
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

# 텐서플로우 다층퍼셉트론 그래프 구현하기
텐서플로우로 아래의 다층퍼셉트론을 구현해보도록 하겠습니다.

In [17]:
# Show the dropout network diagram again as a reference for the graph code.
Image(
    url="https://raw.githubusercontent.com/minsuk-heo/deeplearning/master/img/dropout.png",
    width=500,
    height=250,
)

# 드랍 아웃 (Drop Out)

In [18]:
# Graph inputs: flattened images, one-hot labels, and the dropout keep
# probability (fed at run time so training and evaluation can differ).
x = tf.placeholder(tf.float32, shape=[None, 784])
y = tf.placeholder(tf.float32, shape=[None, 10])
keep_prob = tf.placeholder(dtype=tf.float32)

아래 히든 레이어2(h2)에 드랍아웃을 적용합니다.
keep_prob의 값은 모델을 학습 또는 테스트할 때 결정합니다.

In [19]:
def mlp(x):
    """Build a 784-256-128-10 multilayer perceptron and return its logits.

    Both hidden layers use ReLU. Dropout is applied to the activations of the
    second hidden layer using the module-level ``keep_prob`` placeholder. No
    softmax is applied here; the returned tensor holds raw class scores.

    Args:
        x: input tensor whose last dimension is 784.

    Returns:
        Tensor of logits with 10 entries per input row.

    NOTE(review): the weights are created with ``tf.random_uniform`` and no
    explicit minval/maxval, so they use that function's default range. A
    zero-centred initializer may train faster -- confirm before changing,
    since it would alter the recorded training log.
    """
    def affine(inputs, n_in, n_out):
        # Weight first, then bias, so variables are created in the order
        # w1, b1, w2, b2, w3, b3.
        weights = tf.Variable(tf.random_uniform([n_in, n_out]))
        bias = tf.Variable(tf.zeros([n_out]))
        return tf.matmul(inputs, weights) + bias

    # hidden layer 1
    h1 = tf.nn.relu(affine(x, 784, 256))
    # hidden layer 2, followed by dropout
    h2 = tf.nn.relu(affine(h1, 256, 128))
    h2_drop = tf.nn.dropout(h2, keep_prob)
    # output layer (raw scores)
    logits = affine(h2_drop, 128, 10)

    return logits

In [20]:
# Instantiate the network on the image placeholder; `logits` holds the raw class scores.
logits = mlp(x)

In [21]:
# Mean softmax cross-entropy between the one-hot labels and the logits.
loss_op = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits)
)

In [22]:
# Training step: Adam (learning rate 0.01) minimizing the cross-entropy loss.
train_op = (
    tf.train.AdamOptimizer(learning_rate=0.01)
    .minimize(loss_op)
)

# 조기 종료 (Early Stopping)

매 주기(Epoch)마다 검증 데이터로 검증 정확도를 측정합니다.  
검증 정확도가 최고 검증 정확도보다 낮은데 학습 정확도는 이전 주기보다 올랐거나 0.99를 넘는 경우를 과적합 신호로 보고, 이 신호가 연속으로 5번 누적된 뒤에도 다시 발생하면 조기 종료를 수행합니다.

In [23]:
# Op that initializes all global variables defined in the graph so far.
init = tf.global_variables_initializer()

# Saver used to write and restore checkpoints of the variables.
saver = tf.train.Saver()

# Training hyperparameters: number of epochs, mini-batch size, and the number
# of full mini-batches per epoch.
epoch_cnt, batch_size = 300, 1000
iteration = x_train.shape[0] // batch_size

# Early-stopping settings: warning limit and the running warning counter.
earlystop_threshold, earlystop_cnt = 5, 0

데이터를 모델에 입력시킬 때(feed), 드랍아웃이 있을 경우 항상 keep_prob를 설정해 주어야 합니다.  
학습 시, 10%의 드랍아웃을 하기 위해, keep_prob를 0.9로 설정합니다.  
테스트 시, 드랍 아웃을 사용하지 않을 것이므로, keep_prob를 1.0으로 설정합니다.

In [24]:
# Build the evaluation ops once, before training starts. Creating them inside
# the epoch loop would add a fresh copy of each op to the graph on every epoch.
preds = tf.nn.softmax(logits)  # Apply softmax to logits
correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    prev_train_acc = 0.0
    max_val_acc = 0.0

    for epoch in range(epoch_cnt):
        avg_loss = 0.
        start = 0; end = batch_size

        # One pass over the training data in consecutive mini-batches,
        # with dropout active (keep_prob = 0.9).
        for i in range(iteration):
            _, loss = sess.run([train_op, loss_op],
                               feed_dict={x: x_train[start: end],
                                          y: y_train[start: end],
                                          keep_prob: 0.9})
            start += batch_size; end += batch_size
            # Compute train average loss
            avg_loss += loss / iteration

        # Evaluate with dropout disabled (keep_prob = 1.0).
        # train accuracy
        cur_train_acc = accuracy.eval({x: x_train, y: y_train, keep_prob: 1.0})
        # validation accuracy
        cur_val_acc = accuracy.eval({x: x_val, y: y_val, keep_prob: 1.0})
        # validation loss (computed for inspection together with avg_loss;
        # neither is printed, to keep the per-epoch log short)
        cur_val_loss = loss_op.eval({x: x_val, y: y_val, keep_prob: 1.0})

        print("epoch: "+str(epoch)+
              ", train acc: " + str(cur_train_acc) +
              ", val acc: " + str(cur_val_acc) )

        if cur_val_acc < max_val_acc:
            # Validation accuracy is below the best seen so far. It counts as
            # an overfitting warning only if training accuracy went up since
            # the previous epoch or is already above 0.99.
            if cur_train_acc > prev_train_acc or cur_train_acc > 0.99:
                # The counter is checked before it is incremented, so the
                # loop stops on the warning that follows the one which
                # brought the counter up to earlystop_threshold.
                if earlystop_cnt == earlystop_threshold:
                    print("early stopped on "+str(epoch))
                    break
                else:
                    print("overfitting warning: "+str(earlystop_cnt))
                    earlystop_cnt += 1
            else:
                earlystop_cnt = 0
        else:
            # Validation accuracy matched or beat the best so far: reset the
            # warning counter and checkpoint this model.
            earlystop_cnt = 0
            max_val_acc = cur_val_acc
            # Save the variables to file.
            save_path = saver.save(sess, "model/model.ckpt")
        prev_train_acc = cur_train_acc

epoch: 0, train acc: 0.21114, val acc: 0.2202
epoch: 1, train acc: 0.59308, val acc: 0.6089
epoch: 2, train acc: 0.63722, val acc: 0.6526
epoch: 3, train acc: 0.66614, val acc: 0.6833
epoch: 4, train acc: 0.70422, val acc: 0.7223
epoch: 5, train acc: 0.73572, val acc: 0.7481
epoch: 6, train acc: 0.76096, val acc: 0.771
epoch: 7, train acc: 0.7825, val acc: 0.7907
epoch: 8, train acc: 0.80076, val acc: 0.8099
epoch: 9, train acc: 0.81758, val acc: 0.8231
epoch: 10, train acc: 0.83212, val acc: 0.8361
epoch: 11, train acc: 0.8427, val acc: 0.8437
epoch: 12, train acc: 0.85278, val acc: 0.8522
epoch: 13, train acc: 0.86016, val acc: 0.8582
epoch: 14, train acc: 0.86798, val acc: 0.8673
epoch: 15, train acc: 0.87402, val acc: 0.8724
epoch: 16, train acc: 0.87958, val acc: 0.8777
epoch: 17, train acc: 0.88534, val acc: 0.8813
epoch: 18, train acc: 0.8883, val acc: 0.8849
epoch: 19, train acc: 0.8946, val acc: 0.8886
epoch: 20, train acc: 0.89924, val acc: 0.8916
epoch: 21, train acc: 0.9023

epoch: 158, train acc: 0.98872, val acc: 0.9532
epoch: 159, train acc: 0.9877, val acc: 0.9532
epoch: 160, train acc: 0.98838, val acc: 0.9534
epoch: 161, train acc: 0.987, val acc: 0.9527
epoch: 162, train acc: 0.9875, val acc: 0.9528
epoch: 163, train acc: 0.98684, val acc: 0.9532
epoch: 164, train acc: 0.98776, val acc: 0.9529
epoch: 165, train acc: 0.98572, val acc: 0.9514
epoch: 166, train acc: 0.98628, val acc: 0.9517
epoch: 167, train acc: 0.98726, val acc: 0.9531
epoch: 168, train acc: 0.99078, val acc: 0.9574
epoch: 169, train acc: 0.98982, val acc: 0.9567
epoch: 170, train acc: 0.9889, val acc: 0.9551
epoch: 171, train acc: 0.98782, val acc: 0.953
epoch: 172, train acc: 0.98888, val acc: 0.9546
epoch: 173, train acc: 0.98738, val acc: 0.9541
epoch: 174, train acc: 0.98866, val acc: 0.9555
epoch: 175, train acc: 0.98606, val acc: 0.9538
epoch: 176, train acc: 0.98924, val acc: 0.9557
epoch: 177, train acc: 0.9914, val acc: 0.9569
epoch: 178, train acc: 0.98852, val acc: 0.9542

# 테스트
검증 정확도가 가장 높은 모델을 대상으로 테스트를 진행합니다.

In [25]:
# Accuracy op built directly from the logits. The argmax over the logits picks
# the same class as the argmax over their softmax, so this cell does not depend
# on the `preds` tensor created in the training cell.
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Start testing
with tf.Session() as sess:
    # Restore variables from the checkpoint written during training.
    saver.restore(sess, "model/model.ckpt")
    # Dropout is disabled at test time (keep_prob = 1.0).
    print("[Test Accuracy] :", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))

INFO:tensorflow:Restoring parameters from model/model.ckpt
[Test Accuracy] : 0.957
