# Gradient in an ANN: the slope (gradient) of the loss function with respect to the weight parameters

In [1]:
import numpy as np

In [2]:
class simpleNet:
    """Single-layer network: a 2x3 weight matrix followed by softmax cross-entropy."""

    def __init__(self):
        # Weights start as samples from a standard normal distribution.
        self.W = np.random.randn(2, 3)

    def predict(self, x):
        """Return the raw scores ``np.dot(x, W)`` for input ``x``."""
        return np.dot(x, self.W)

    def softmax(self, x):
        """Return the softmax of ``x``.

        A 2-D input is normalised row by row; any other input is normalised
        over all of its elements.
        """
        if x.ndim == 2:
            # Subtract each row's maximum so np.exp cannot overflow.
            shifted = x - np.max(x, axis=1, keepdims=True)
            exp_scores = np.exp(shifted)
            return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Subtract the maximum so np.exp cannot overflow.
        shifted = x - np.max(x)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores)

    def cross_entropy_error(self, y, t):
        """Return the mean cross-entropy of predictions ``y`` against targets ``t``.

        ``t`` may hold class indices, or one-hot rows with the same number of
        elements as ``y``.
        """
        if y.ndim == 1:
            # Promote a single sample to a batch of one.
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # If the targets are one-hot vectors, convert them to class indices.
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        print('batch_size =', batch_size)
        # The 1e-7 offset keeps np.log away from log(0).
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

    def loss(self, x, t):
        """Return the cross-entropy loss of the prediction for ``x`` against ``t``."""
        z = self.predict(x)
        y = self.softmax(z)
        print('y.shape =', y.shape, 'y.ndim =', y.ndim, 'y =', y)
        # Compare against the target t, not the logits z: passing z would
        # score the prediction against its own argmax and ignore the labels.
        return self.cross_entropy_error(y, t)

In [3]:
net = simpleNet()

In [4]:
print(net.W.shape)
net.W

(2, 3)


array([[ 1.23984704,  0.34491688,  2.23902429],
       [-0.90642756, -0.04713224,  0.17343906]])

In [5]:
# Replace the randomly initialised weights with fixed values.
net.W = np.array([[0.47355232, 0.9977393, 0.84668094], [0.855557411, 0.03563661, 0.69422093]])

In [6]:
net.W

array([[0.47355232, 0.9977393 , 0.84668094],
       [0.85555741, 0.03563661, 0.69422093]])

In [7]:
x = np.array([0.6, 0.9])

In [8]:
p = net.predict(x)
p

array([1.05413306, 0.63071653, 1.1328074 ])

In [9]:
np.argmax(p)

2

In [10]:
# Target for the sample. It has as many elements as the network output, so
# cross_entropy_error reduces it with argmax to class index 2.
t = np.array([0,0,1])

In [11]:
net.loss(x, t)

y.shape = (3,) y.ndim = 1 y = [0.36540923 0.23927209 0.39531868]
batch_size = 1


0.9280627939898086

# Compute the gradient of the loss function with respect to the weight parameters

In [12]:
def f(W):
    """Return the loss of ``net`` on the sample ``(x, t)``.

    ``W`` is not used: ``net.loss`` reads ``net.W`` directly, and
    ``numerical_gradient`` perturbs that same array in place when it is
    called with ``net.W``.
    """
    current_loss = net.loss(x, t)
    return current_loss

In [13]:
# def numerical_gradient(f, x):
#     h = 1e-4 # 0.0001
#     gradient = np.zeros_like(x)
#     for i in range(0, x.size):
#         tmp_val = x[i]
#         # f(x+h)
#         x[i] = float(tmp_val) + h
#         fx_right = f(x)
#         # f(x-h)
#         x[i] = float(tmp_val) - h
#         fx_left = f(x)
#         # gradient
#         gradient[i] = (fx_right - fx_left) / (2*h)
#         # restore the original value
#         x[i] = tmp_val
#     return gradient

In [14]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` by central differences.

    Each element of ``x`` is shifted in place by ``+h`` and ``-h``, ``f(x)``
    is evaluated for both, and the element is then restored. The result has
    the same shape and dtype as ``x``.
    """
    step = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    walker = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    for _ in walker:
        pos = walker.multi_index
        original = x[pos]

        # f evaluated with this element shifted up by the step
        x[pos] = float(original) + step
        value_plus = f(x)

        # f evaluated with this element shifted down by the step
        x[pos] = original - step
        value_minus = f(x)

        grad[pos] = (value_plus - value_minus) / (2 * step)

        # put the original value back before moving on
        x[pos] = original
    return grad

In [15]:
# Numerical gradient of the loss with respect to net.W; dW has the same shape as net.W.
dW = numerical_gradient(f, net.W)
dW

y.shape = (3,) y.ndim = 1 y = [0.36542314 0.23926684 0.39531002]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36539531 0.23927734 0.39532735]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540398 0.23928301 0.39531301]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541447 0.23926117 0.39532436]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540056 0.23926642 0.39533303]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541789 0.23927777 0.39530434]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.3654301  0.23926422 0.39530568]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36538836 0.23927996 0.39533168]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540136 0.23928847 0.39531017]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541709 0.23925571 0.3953272 ]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36539623 0.23926358 0.3953402 ]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36542223 0.2392806  0.39529717]
batch_size = 1


array([[ 0.21924548,  0.14356322, -0.3628087 ],
       [ 0.32886822,  0.21534483, -0.54421305]])

# Use lambda

In [16]:
# Same objective written as a lambda. The argument w is unused because
# net.loss reads net.W, which numerical_gradient perturbs in place.
f = lambda w: net.loss(x, t)
dW = numerical_gradient(f, net.W)
dW

y.shape = (3,) y.ndim = 1 y = [0.36542314 0.23926684 0.39531002]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36539531 0.23927734 0.39532735]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540398 0.23928301 0.39531301]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541447 0.23926117 0.39532436]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540056 0.23926642 0.39533303]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541789 0.23927777 0.39530434]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.3654301  0.23926422 0.39530568]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36538836 0.23927996 0.39533168]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36540136 0.23928847 0.39531017]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36541709 0.23925571 0.3953272 ]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36539623 0.23926358 0.3953402 ]
batch_size = 1
y.shape = (3,) y.ndim = 1 y = [0.36542223 0.2392806  0.39529717]
batch_size = 1


array([[ 0.21924548,  0.14356322, -0.3628087 ],
       [ 0.32886822,  0.21534483, -0.54421305]])