In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def sigmoid(z):
    a = 1/(1+np.exp(-z))
    return a

def relu(z):
    a = np.maximum(0, z)
    return a

def leaky_relu(z, aplha=0.01):
    a = np.maximum(aplha*z, z)
    return a

def tanh(z):
    a = (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))
    return a

def softmax(z):
    exp_z = np.exp(z)
    a = np.divide(exp_z, np.sum(exp_z))
    return a

def activate(z, activation):
    if activation == "sigmoid":
        return sigmoid(z)
    elif activation == "relu":
        return relu(z)
    elif activation == "leaky_relu":
        return leaky_relu(z)
    elif activation == "tanh":
        return tanh(z)
    else:
        return softmax(z)

In [3]:
class layer:
    def __init__(self, shape, activation):
        self.W = np.random.randn(*shape)*0.01
        self.b = np.zeros((shape[-1], 1))
        self.activation = activation
        
    def forward(self, x):
        self.z = np.dot(self.W, x) + self.b
        self.a = activate(self.z, self.activation)
        return self.a, self.w, self.z, x

In [34]:
def backward_propagation_sigmoid(dal, wl, zl, al_1):
    m = dal.shape[-1]
    al = sigmoid(zl)
    dzl = dal * al * (1 - al)
    dwl = 1/m*np.dot(dzl, al_1.T)
    dbl = 1/m*np.sum(dzl, axis=1, keepdims=True)
    dal_1 = np.dot(wl.T, dzl)
    return dal_1, dwl, dbl

def backward_propagation_relu(dal, wl, zl, al_1):
    m = dal.shape[-1]
    al = relu(zl)
    dal_dzl = zl >= 0
    dal_dzl = dal_dzl.astype("int")
    dzl = dal*dal_dzl
    dwl = 1/m*np.dot(dzl, al_1.T)
    dbl = 1/m*np.sum(dzl, axis=1, keepdims=True)
    dal_1 = np.dot(wl.T, dzl)
    return dal_1, dwl, dbl

def backward_propagation_tanh(dal, wl, zl, al_1):
    m = dal.shape[-1]
    al = tanh(zl)
    dzl = dal*(1-al**2)
    dwl = 1/m*np.dot(dzl, al_1.T)
    dbl = 1/m*np.sum(dzl, axis=1, keepdims=True)
    dal_1 = np.dot(wl.T, dzl)
    return dal_1, dwl, dbl

def backward_propagation(dal, wl, zl, al_1, act):
    if act == "sigmoid":
        return backward_propagation_sigmoid(dal, wl, zl, al_1)
    elif act == "relu":
        return backward_propagation_relu(dal, wl, zl, al_1)
    else:
        return backward_propagation_tanh(dal, wl, zl, al_1)

# c.shape = (1, m), z.shape = (nc, m), zc.shape = (1, m)
def select(z, c):
    zc = []
    for i, j in zip(c[0, :], range(c.shape[-1])):
        zc.append(z[i, j])
    zc = np.array(zc).reshape(1, len(zc))
    return zc

def grads_correction(dal_dzl, temp, c):
    for idx, i in zip(c[0, :], range(dal_dzl.shape[-1])):
        dal_dzl[idx, i] = temp[0, i]
    return dal_dzl

def backward_propagation_softmax(dal, wl, zl, al_1, c):
    m = dal.shape[-1]
    zc = select(zl, c) # zc.shape = (1, m)
    denominator = np.sum(np.exp(zl))**2
    dal_dzl = (np.sum(np.exp(zl)) - np.exp(zc)*np.exp(zl)) / denominator # dal_dzl.shape = (nc, m)
    temp = (np.exp(zc)*np.sum(np.exp(zl)) - np.square(np.exp(zc))) / denominator # temp.shape = (1, m)
    # dal_dzl needs correction on c indices
    dal_dzl = grads_correction(dal_dzl, temp, c)
    dzl = dal*dal_dzl
    dwl = 1/m*np.dot(dzl, al_1.T)
    dbl = 1/m*np.sum(dzl, axis=1, keepdims=True)
    dal_1 = np.dot(wl.T, dzl)
    return dal_1, dwl, dbl

In [5]:
class model:
    def __init__(self):
        self.layers = []
    
    def forward_prop(self, x):
        cache = {}
        al = x
        for layer, l in zip(self.layers, range(1, len(self.layers)+1)):
            al, wl, zl, al_1 = layer.forward(al)
            cache["l"+str(l)] = [wl, zl, al_1]
        return al, cache
    
    def backward_prop(self, y_hat, y, cache):
        dal = np.divide(y, y_hat)
        c = []
        for i in range(y.shape[-1]):
            c.append(np.nonzero(y[:, i])[0])
        c = np.array(c).T
        ## to be continued

In [6]:
y_hat = np.array([0,0,0,1,0]).reshape(1, 5)

In [7]:
idx = np.array([2,2,1,0])

zl = np.random.randn(3, 4)

zl

array([[ 0.16656736, -0.04305739,  0.72958959, -0.55255352],
       [ 0.17038562,  1.7123673 , -0.5390334 ,  0.82161837],
       [ 0.19062715,  0.41407506, -0.24065972, -1.1616821 ]])

In [8]:
for i in range(len(idx)):
    print(i, idx[i], zl[idx[i], i])

0 2 0.19062715025940352
1 2 0.4140750614492724
2 1 -0.5390333957279909
3 0 -0.5525535249399043


In [9]:
np.array([zl[idx[i], i] for i in range(len(idx))])

array([ 0.19062715,  0.41407506, -0.5390334 , -0.55255352])

In [11]:
idx

array([2, 2, 1, 0])

In [12]:
y = np.array([[0,0,1], [0,1,0], [1,0,0], [0,1,0], [1,0,0]]).T
y.shape

(3, 5)

In [13]:
y

array([[0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0]])

In [14]:
c = []
for i in range(y.shape[-1]):
    c.append(np.nonzero(y[:, i])[0])

c = np.array(c).T
print(c, c.shape)

[[2 1 0 1 0]] (1, 5)


In [15]:
np.random.seed(1)
z = np.random.randn(3, 5)
z

array([[ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763],
       [-2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038],
       [ 1.46210794, -2.06014071, -0.3224172 , -0.38405435,  1.13376944]])

In [16]:
zc = []
for i, j in zip(c[0, :], range(c.shape[-1])):
    zc.append(z[i, j])
np.array(zc).reshape(1, len(zc))

array([[ 1.46210794,  1.74481176, -0.52817175,  0.3190391 ,  0.86540763]])

In [17]:
select(z, c)

array([[ 1.46210794,  1.74481176, -0.52817175,  0.3190391 ,  0.86540763]])

In [19]:
c.shape

(1, 5)

In [20]:
c

array([[2, 1, 0, 1, 0]])

In [27]:
np.random.seed(1)
nc = 3
m = 5
dal_dzl = np.random.randn(nc, m)
temp = np.random.randn(1, m)
print(dal_dzl)
print()
print(temp)

[[ 1.62434536 -0.61175641 -0.52817175 -1.07296862  0.86540763]
 [-2.3015387   1.74481176 -0.7612069   0.3190391  -0.24937038]
 [ 1.46210794 -2.06014071 -0.3224172  -0.38405435  1.13376944]]

[[-1.09989127 -0.17242821 -0.87785842  0.04221375  0.58281521]]


In [31]:
for idx, i in zip(c[0, :], range(dal_dzl.shape[-1])):
    dal_dzl[idx, i] = temp[0, i]

In [32]:
dal_dzl

array([[ 1.62434536, -0.61175641, -0.87785842, -1.07296862,  0.58281521],
       [-2.3015387 , -0.17242821, -0.7612069 ,  0.04221375, -0.24937038],
       [-1.09989127, -2.06014071, -0.3224172 , -0.38405435,  1.13376944]])

In [37]:
np.random.seed(1)
dal = np.random.randn(3, 5)
wl = np.random.randn(3, 4)
zl = np.random.randn(3, 5)
al_1 = np.random.randn(4, 5)
backward_propagation_softmax(dal, wl, zl, al_1, c)

(array([[-0.13006214,  0.01142872,  0.02034787,  0.06813306, -0.01185977],
        [ 0.22244743, -0.13840802,  0.04234103, -0.02857301,  0.07081474],
        [-0.30546   ,  0.15495062,  0.00329373,  0.09027558, -0.09215881],
        [-0.12849411,  0.05115873, -0.03924748,  0.01610497, -0.02014042]]),
 array([[-0.01758276,  0.02680404, -0.01879969,  0.02484141],
        [ 0.03143987, -0.05803909,  0.02873865, -0.03631824],
        [-0.04633885,  0.0326115 , -0.00667118,  0.03039386]]),
 array([[-0.00375252],
        [-0.02588729],
        [ 0.00508998]]))

In [36]:
c

array([[2, 1, 0, 1, 0]])