# 다층 퍼셉트론 Numpy로 직접 구현해보기

###### 1. Tensorflow 기반 분류 모델 예시코드

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# Load data set
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocessing
x_train_norm, x_test_norm = x_train / 255.0, x_test / 255.0
x_train_reshaped = x_train_norm.reshape(-1, x_train_norm.shape[1]*x_train_norm.shape[2])
x_test_reshaped = x_test_norm.reshape(-1, x_test_norm.shape[1]*x_test_norm.shape[2])

# Model
model = keras.models.Sequential()
model.add(keras.layers.Dense(50, activation = 'sigmoid', input_shape = (784,))) # 입력층 d = 784, 은닉층 레이어 H = 50
model.add(keras.layers.Dense(10, activation = 'softmax')) # 출력층 레이어 K =10
model.summary()

# Model fit
model.compile(optimizer = 'adam',
             loss = 'sparse_categorical_crossentropy',
             metrics = ['accuracy'])
model.fit(x_train_reshaped, y_train, epochs = 10)

# Model result
test_loss, test_accuracy = model.evaluate(x_test_reshaped, y_test, verbose = 2)
print("test_loss: {} ".format(test_loss))
print("test_accuracy: {}".format(test_accuracy))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                39250     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.1043 - accuracy: 0.9680
test_loss: 0.10432688891887665 
test_accuracy: 0.9679999947547913


##### Parameters/Weights

* 입력층-은닉층, 은닉층-출력층 사이에는 사실 각각 행렬(Matrix)이 존재합니다. 예를 들어 입력값이 100개, 은닉 노드가 20개라면 사실 이 입력층-은닉층 사이에는 100x20의 형태를 가진 행렬이 존재합니다. 똑같이, MNIST 데이터처럼 10개의 클래스를 맞추는 문제를 풀기 위해 출력층이 10개의 노드를 가진다면 은닉층-출력층 사이에는 20x10의 형태를 가진 행렬이 존재하게 됩니다.

* Parameter 혹은 Weight라고 부릅니다. 두 단어는 보통 같은 뜻으로 사용되지만, 실제로 Paraemter에는 위의 참고자료에서 다룬 bias 노드도 포함된다는 점만 유의해 주세요. 이때 인접한 레이어 사이에는 아래와 같은 관계가 성립합니다.
  * y = W * X + b 

###  MLP 기반 딥러닝 모델을 Numpy로 구현해보기

##### 1) 입력층

In [2]:
# 입력층의 데이터 형태 (shape)
print(x_train_reshaped.shape)

X = x_train_reshaped[:5]
print(X.shape)
X

(60000, 784)
(5, 784)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### 2) Feed Forward

In [3]:
weight_init_std = 0.1
input_size = 784
hidden_size = 50

# 인접 레이어간 관계를 나타내는 파라미터 W를 생성하고 random 초기화
W1 = weight_init_std * np.random.randn(input_size, hidden_size)

# 바이어스 파라미터 b를 생성하고 Zero로 초기화
b1 = np.zeros(hidden_size)

a1 = np.dot(X, W1) +b1 # 은닉층 출력


print(W1.shape)
print(b1.shape)
print(a1.shape)
a1[0]

(784, 50)
(50,)
(5, 50)


array([ 0.2612843 , -0.4013273 , -1.03136202, -0.60855203,  1.29834405,
        0.02259547,  1.2016102 ,  0.60213712, -1.63256726, -1.39478777,
       -1.05270691,  0.35211582, -0.16017535,  0.42467962, -0.37930823,
       -1.42360352,  1.08810911, -0.8866059 , -0.5114341 ,  0.44231089,
        1.70795199,  1.02747354,  0.68021111, -0.62451391, -0.36528738,
        0.83843795,  1.08900454,  0.42731279,  0.99028257,  1.12324313,
        1.00023888, -0.31703171,  0.3334153 ,  0.07851691,  0.32927863,
        1.10676901,  0.06053865,  1.37631595, -0.09646048,  0.51913273,
        0.15853866,  0.09680852,  0.54801193,  0.67788114,  1.45275654,
       -0.12250306,  0.16199801,  0.43410442, -0.89390753, -0.96357116])

##### 2-1) Activation Function

In [4]:
# 첫번째 은닉층의 출력 a1에 sigmoid 적용

def sigmoid(x):
    return 1/(1+np.exp(-x))

z1 = sigmoid(a1)
print(z1[0])

[0.56495198 0.40099348 0.26282013 0.35238957 0.78555616 0.50564863
 0.76881111 0.64614509 0.16347898 0.19864452 0.25870564 0.58713057
 0.46004156 0.6046025  0.40629375 0.19409729 0.74802549 0.29181075
 0.3748574  0.60880953 0.84657046 0.7364258  0.66378581 0.34875553
 0.40968025 0.69813613 0.74819422 0.60523181 0.72914373 0.75458979
 0.73110554 0.42139931 0.58259014 0.51961915 0.58158385 0.75152626
 0.51513004 0.79839867 0.47590356 0.62694495 0.53955186 0.52418325
 0.63367422 0.66326563 0.81042231 0.46941248 0.54041116 0.60685334
 0.29030411 0.27616376]


##### 2-2) Dense layer

In [6]:
def affine_layer_forward(X, W, b):
    y = np.dot(X, W) + b
    cache = (X, W, b)
    return y, cache

In [8]:
input_size = 784
hidden_size = 50
output_size = 10

W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

a1, cache1 = affine_layer_forward(X,W1,b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)

print(a2[0])

[ 0.6293621  -0.29038705  0.22399015  0.1068836  -0.04822379  0.16365023
 -0.29753613  0.17944764 -0.37526025 -0.13568257]


##### 3) pred

*  softmax 함수란?

softmax 함수는 K개의 값이 존재할 때 각각의 값의 편차를 확대시켜 큰 값은 상대적으로 더 크게, 작은 값은 상대적으로 더 작게 만든 다음에 normalization 시키는 함수

In [10]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis = 0)
        y = np.exp(x) / np.sum(np.exp(x), axis = 0)
        return y.T
    x = x - np.max(x)
    return np.exp(x) / np.sum(np.exp(x))

In [11]:
y_hat = softmax(a2)
y_hat[0]

array([0.17671328, 0.07044127, 0.11781983, 0.1047996 , 0.08974233,
       0.11092082, 0.06993948, 0.112687  , 0.06470938, 0.08222701])

In [None]:
y_hat = softmax(a2)
y_hat[0]  # 10개의 숫자 중 하나일 확률이 되었습니다.

##### 4) Loss Functions

In [13]:
def _change_ont_hot_label(X, num_category):
    T = np.zeros((X.size, num_category))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
    return T

Y_digit = y_train[:5]
t = _change_ont_hot_label(Y_digit, 10)
t

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [None]:
# 정답 라벨을 One-hot 인코딩하는 함수
def _change_ont_hot_label(X, num_category):
    T = np.zeros((X.size, num_category))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
        
    return T

Y_digit = y_train[:5]
t = _change_ont_hot_label(Y_digit, 10)
t     # 정답 라벨의 One-hot 인코딩

In [15]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    
    if t.size == y.size:
        t = t.argmax(axis = 1)
        
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

Loss = cross_entropy_error(y_hat, t)
Loss

2.249290577065053

In [None]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

Loss = cross_entropy_error(y_hat, t)
Loss

##### 5) Gradient Descent

In [17]:
batch_num = y_hat.shape[0]
dy = (y_hat - t) / batch_num
dy # softmax값의 출력으로 Loss를 미분한 값

array([[ 0.03534266,  0.01408825,  0.02356397,  0.02095992,  0.01794847,
        -0.17781584,  0.0139879 ,  0.0225374 ,  0.01294188,  0.0164454 ],
       [-0.16667751,  0.01328617,  0.02351717,  0.01644038,  0.02011891,
         0.02448762,  0.01396723,  0.02391614,  0.01347546,  0.01746843],
       [ 0.04401926,  0.01273787,  0.02124276,  0.01561742, -0.18226752,
         0.02377224,  0.01527945,  0.01809542,  0.01501545,  0.01648765],
       [ 0.03309813, -0.18456459,  0.02089384,  0.01759884,  0.017801  ,
         0.02563827,  0.01488745,  0.02053569,  0.01749383,  0.01661754],
       [ 0.03524385,  0.01552865,  0.02427028,  0.01390858,  0.01408146,
         0.02615493,  0.01433308,  0.02065112,  0.01518332, -0.17935526]])

In [None]:
batch_num = y_hat.shape[0]
dy = (y_hat - t) / batch_num
dy    # softmax값의 출력으로 Loss를 미분한 값

In [19]:
dW2 = np.dot(z1.T, dy)
db2 = np.sum(dy, axis=0)

In [20]:
# 중간에 sigmoid가 한번 사용되었으므로, 활성화함수에 대한 gradient도 고려
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

In [21]:
dz1 = np.dot(dy, W2.T)
da1 = sigmoid_grad(a1) *dz1
dW1 = np.dot(X.T, da1)
db1 = np.sum(dz1, axis = 0)

In [22]:
# learning_rate도 고려하여 파라미터 업데이트

learning_rate =  0.1
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    return W1, b1, W2, b2

##### 6) Backpropagation 

In [34]:
def affine_layer_backward(dy, cache):
    X, W, b = cache
    dX = np.dot(dy, W.T)
    dW = np.dot(X.T, dy)
    db = np.sum(dy, axis=0)
    return dX, dW, db

##### 7) train

In [35]:
W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

def train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=False):
    a1, cache1 = affine_layer_forward(X, W1, b1)
    z1 = sigmoid(a1)
    a2, cache2 = affine_layer_forward(z1, W2, b2)
    y_hat = softmax(a2)
    t = _change_ont_hot_label(Y, 10)
    Loss = cross_entropy_error(y_hat, t)

    if verbose:
        print('---------')
        print(y_hat)
        print(t)
        print('Loss: ', Loss)
        
    dy = (y_hat - t) / X.shape[0]
    dz1, dW2, db2 = affine_layer_backward(dy, cache2)
    da1 = sigmoid_grad(a1) * dz1
    dX, dW1, db1 = affine_layer_backward(da1, cache1)
    
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
    
    return W1, b1, W2, b2, Loss

In [36]:
X = x_train_reshaped[:5]
Y = y_train[:5]

for i in range(5):
    W1, b1, W2, b2, _ = train_step(X, Y, W1, b1, W2, b2, learning_rate=0.1, verbose=True)

---------
[[0.09158278 0.16784034 0.09042252 0.13037681 0.09589499 0.04362908
  0.08620395 0.14247615 0.08663319 0.0649402 ]
 [0.09544838 0.15293797 0.08422071 0.13563836 0.1149616  0.03763848
  0.088985   0.13027061 0.08906494 0.07083394]
 [0.11183921 0.15103187 0.07935425 0.11958546 0.13906404 0.04820051
  0.08400501 0.1338072  0.08041449 0.05269795]
 [0.11127857 0.13201824 0.07651117 0.13362379 0.13483178 0.03892213
  0.08463594 0.1368634  0.0959157  0.05539928]
 [0.10897784 0.13451369 0.08334208 0.11906637 0.1323748  0.05041743
  0.08802842 0.12438213 0.09030912 0.06858814]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Loss:  2.4316945149755673
---------
[[0.10873859 0.18196954 0.08157167 0.10972236 0.10759103 0.05947351
  0.07749992 0.11718746 0.07671829 0.07952763]
 [0.11652402 0.16329105 0.07564833 0.11461112 0.12830453 0.04882243
  0.08045388 0.1071762  0.079

##### 8) predict

In [37]:
def predict(W1, b1, W2, b2, X):
    a1 = np.dot(X, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = softmax(a2)

    return y

In [38]:
# X = x_train[:100] 에 대해 모델 추론을 시도합니다. 

X = x_train_reshaped[:100]
Y = y_test[:100]
result = predict(W1, b1, W2, b2, X)
result[0]

array([0.14858788, 0.19330781, 0.05235202, 0.0616807 , 0.12264183,
       0.13936364, 0.04927637, 0.06249381, 0.04698714, 0.1233088 ])

In [39]:
def accuracy(W1, b1, W2, b2, x, y):
    y_hat = predict(W1, b1, W2, b2, x)
    y_hat = np.argmax(y_hat, axis=1)
   # t = np.argmax(t, axis=1)

    accuracy = np.sum(y_hat == y) / float(x.shape[0])
    return accuracy

In [40]:
acc = accuracy(W1, b1, W2, b2, X, Y)

t = _change_ont_hot_label(Y, 10)
print(result[0])
print(t[0])
print(acc)

[0.14858788 0.19330781 0.05235202 0.0616807  0.12264183 0.13936364
 0.04927637 0.06249381 0.04698714 0.1233088 ]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
0.12


---

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# MNIST 데이터를 로드. 다운로드하지 않았다면 다운로드까지 자동으로 진행됩니다. 
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()   

# 모델에 맞게 데이터 가공
x_train_norm, x_test_norm = x_train / 255.0, x_test / 255.0
x_train_reshaped = x_train_norm.reshape(-1, x_train_norm.shape[1]*x_train_norm.shape[2])
x_test_reshaped = x_test_norm.reshape(-1, x_test_norm.shape[1]*x_test_norm.shape[2])

# 딥러닝 모델 구성 - 2 Layer Perceptron
model=keras.models.Sequential()
model.add(keras.layers.Dense(50, activation='sigmoid', input_shape=(784,)))  # 입력층 d=784, 은닉층 레이어 H=50
model.add(keras.layers.Dense(10, activation='softmax'))   # 출력층 레이어 K=10
model.summary()

# 모델 구성과 학습
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
model.fit(x_train_reshaped, y_train, epochs=10)

# 모델 테스트 결과
test_loss, test_accuracy = model.evaluate(x_test_reshaped,y_test, verbose=2)
print("test_loss: {} ".format(test_loss))
print("test_accuracy: {}".format(test_accuracy))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                39250     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.0977 - accuracy: 0.9692
test_loss: 0.09767191857099533 
test_accuracy: 0.9692000150680542


In [2]:
# 입력층 데이터의 모양(shape)
print(x_train_reshaped.shape)

# 테스트를 위해 x_train_reshaped의 앞 5개의 데이터를 가져온다.
X = x_train_reshaped[:5]
print(X.shape)

(60000, 784)
(5, 784)


In [3]:
weight_init_std = 0.1
input_size = 784
hidden_size=50

# 인접 레이어간 관계를 나타내는 파라미터 W를 생성하고 random 초기화
W1 = weight_init_std * np.random.randn(input_size, hidden_size)  
# 바이어스 파라미터 b를 생성하고 Zero로 초기화
b1 = np.zeros(hidden_size)

a1 = np.dot(X, W1) + b1   # 은닉층 출력

print(W1.shape)
print(b1.shape)
print(a1.shape)

(784, 50)
(50,)
(5, 50)


In [4]:
# 첫번째 데이터의 은닉층 출력을 확인해 봅시다.  50dim의 벡터가 나오나요?
a1[0]

array([ 0.26900943, -0.74499413,  0.76569976,  1.52396907,  0.77407353,
        0.5106225 ,  0.09994509,  0.19317798,  0.49943508,  0.69454337,
       -0.2193685 ,  0.2404075 ,  0.30131396,  0.26771715,  0.88435629,
        0.67215659, -0.0294649 ,  0.00544126, -0.04665228,  0.81957332,
        1.0683334 ,  1.47917226, -0.37593638,  0.96372063,  0.96274987,
       -0.834748  , -0.13030156, -0.13883221,  0.08527631, -0.15032689,
       -0.13430864,  0.50450818, -0.64520428, -1.549126  ,  0.0989828 ,
       -0.79408716,  2.55269759,  1.40927037,  1.021984  , -0.24234614,
       -1.22118648,  0.40248027, -0.68045444, -0.28597097,  0.31430273,
        0.92634547,  0.69817375,  0.94473074, -0.57103992,  0.47605183])

In [5]:
# sigmoid 함수 구현

def sigmoid(x):
    return 1/(1+np.exp(-x))

z1 = sigmoid(a1)
print(z1[0])

[0.5668497  0.32191303 0.68258993 0.8211222  0.68440142 0.62495239
 0.52496549 0.54814487 0.62232656 0.66697686 0.44537675 0.55981407
 0.57476369 0.56653238 0.70772414 0.66198589 0.49263431 0.50136031
 0.48833904 0.69414576 0.74427984 0.81444752 0.40710737 0.72386612
 0.72367204 0.30264207 0.46747062 0.46534759 0.52130617 0.46248889
 0.46647322 0.62351818 0.34407105 0.17521254 0.52472551 0.31129175
 0.92775453 0.80365084 0.73535888 0.43970826 0.22772772 0.59928343
 0.33615988 0.42899052 0.57793516 0.71633327 0.66778274 0.72005425
 0.3609969  0.61681514]


In [9]:
# 단일 레이어 구현 함수

def affine_layer_forward(X,W,b):
    y = np.dot(X,W) + b
    cache = (X,W,b)
    return y, cache

In [10]:
input_size = 784
hidden_size = 50
output_size = 10

W1 = weight_init_std * np.random.randn(input_size, hidden_size)
b1 = np.zeros(hidden_size)
W2 = weight_init_std * np.random.randn(hidden_size, output_size)
b2 = np.zeros(output_size)

a1, cache1 = affine_layer_forward(X, W1, b1)
z1 = sigmoid(a1)
a2, cache2 = affine_layer_forward(z1, W2, b2)    # z1이 다시 두번째 레이어의 입력이 됩니다. 

print(a2[0])  # 최종 출력이 output_size만큼의 벡터가 되었습니다.

[-0.07734357  0.02036746  0.03429832 -0.09785393 -0.33464087  0.37359658
  0.48617341  0.1699204   0.08121021 -0.68471173]


In [11]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) # 오버플로 대책
    return np.exp(x) / np.sum(np.exp(x))

In [12]:
y_hat = softmax(a2)
y_hat[0]  # 10개의 숫자 중 하나일 확률이 되었습니다.

array([0.08851644, 0.09760214, 0.09897133, 0.08671943, 0.06843546,
       0.13895199, 0.15550927, 0.11334682, 0.10372489, 0.04822223])

In [13]:
# 정답 라벨을 One-hot 인코딩하는 함수
def _change_ont_hot_label(X, num_category):
    T = np.zeros((X.size, num_category))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
        
    return T

Y_digit = y_train[:5]
t = _change_ont_hot_label(Y_digit, 10)
t     # 정답 라벨의 One-hot 인코딩

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [14]:
print(y_hat[0])
print(t[0])

[0.08851644 0.09760214 0.09897133 0.08671943 0.06843546 0.13895199
 0.15550927 0.11334682 0.10372489 0.04822223]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [15]:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # 훈련 데이터가 원-핫 벡터라면 정답 레이블의 인덱스로 반환
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

Loss = cross_entropy_error(y_hat, t)
Loss

2.4199465184469715

In [16]:
batch_num = y_hat.shape[0]
dy = (y_hat - t) / batch_num
dy    # softmax값의 출력으로 Loss를 미분한 값

array([[ 0.01770329,  0.01952043,  0.01979427,  0.01734389,  0.01368709,
        -0.1722096 ,  0.03110185,  0.02266936,  0.02074498,  0.00964445],
       [-0.18351893,  0.01901633,  0.01621143,  0.01469751,  0.01483514,
         0.02658811,  0.03665958,  0.02123464,  0.0212283 ,  0.01304789],
       [ 0.01973813,  0.01816138,  0.01960985,  0.01663035, -0.1853926 ,
         0.0283451 ,  0.03105554,  0.02230375,  0.01859942,  0.01094909],
       [ 0.01306896, -0.17956437,  0.01962564,  0.01974692,  0.01393162,
         0.02832278,  0.02939372,  0.02172824,  0.02189062,  0.01185586],
       [ 0.01312517,  0.01865193,  0.01970109,  0.01827955,  0.01593988,
         0.0263097 ,  0.03234102,  0.02019484,  0.02244129, -0.18698447]])

In [17]:
dW2 = np.dot(z1.T, dy)    
dW2

array([[-0.08960477, -0.05408093,  0.0529602 ,  0.04836863, -0.06908652,
        -0.02187472,  0.09086445,  0.06065396,  0.0589838 , -0.07718409],
       [-0.09960359, -0.09009493,  0.06192372,  0.05701928, -0.01770513,
        -0.07549234,  0.1055538 ,  0.07088424,  0.06955594, -0.08204098],
       [-0.07054283, -0.06388465,  0.06342962,  0.05770504, -0.08669214,
        -0.07640982,  0.10664529,  0.07241279,  0.06963729, -0.07230059],
       [-0.00370311, -0.0480881 ,  0.02342693,  0.02173891, -0.04112274,
         0.02448275,  0.03816521,  0.02585368,  0.02564127, -0.0663948 ],
       [-0.00432232, -0.06806661,  0.04547689,  0.04130017, -0.12399126,
        -0.02342553,  0.07371025,  0.05131757,  0.04838205, -0.04038122],
       [-0.02853415, -0.03694057,  0.0300887 ,  0.02746236, -0.05062225,
        -0.01577773,  0.05032331,  0.03414255,  0.03295264, -0.04309487],
       [-0.09338662, -0.03827291,  0.05696451,  0.05170948, -0.05853396,
        -0.08111507,  0.09759104,  0.06553326

In [18]:
dW2 = np.dot(z1.T, dy)
db2 = np.sum(dy, axis=0)

In [19]:
def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

In [20]:
dz1 = np.dot(dy, W2.T)
da1 = sigmoid_grad(a1) * dz1
dW1 = np.dot(X.T, da1)
db1 = np.sum(dz1, axis=0)

In [21]:
learning_rate = 0.1

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    return W1, b1, W2, b2