### https://www.reddit.com/r/cs231n/comments/443y2g/hints_for_a2/

In [1]:
# As usual, a bit of setup
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.cnn import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient_array, eval_numerical_gradient
from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.solver import Solver

%matplotlib inline
# Notebook-wide matplotlib defaults, applied in a single update.
plt.rcParams.update({
    'figure.figsize': (12.0, 9.6),  # set default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """Return the largest elementwise relative error between x and y.

  Each element's error is |x - y| / max(1e-8, |x| + |y|); the 1e-8 floor
  keeps the denominator away from zero when both entries are zero.
  """
  abs_diff = np.abs(x - y)
  scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
  return np.max(abs_diff / scale)

In [2]:
# Numeric gradient checking for intermediate results

# When you are trying to implement a tricky backprop,
# it can sometimes be tough to get everything correct all at once.
# A useful strategy for debugging is to numerically gradient check your intermediates as well.
# As an example, suppose we have the function

def f_forward(a, b, c):
  """Forward pass of the example function, out = a / b + b / c.

  Returns:
    (out, cache), where cache is the tuple (a, b, c) of the inputs.
  """
  result = a / b + b / c
  return result, (a, b, c)

# and we want to implement the corresponding backward pass.
# First you can rewrite this function to compute its output in terms of intermediates:

def f_forward(a, b, c):
  """Forward pass of out = a / b + b / c, written via intermediates.

  The two quotients are named t1 and t2 so that a backward pass can be
  reasoned about one intermediate at a time.

  Returns:
    (out, cache), where cache is the tuple (a, b, c) of the inputs.
  """
  t1, t2 = a / b, b / c
  return t1 + t2, (a, b, c)

# Now in the backward pass you will receive dout, use it to compute dt1
# and dt2, and in turn use those to compute da, db, and dc. You can debug
# your backprop logic step by step by defining partial functions.
# For example, first define a function that computes t1 from a, b, and c:

def f_forward_partial(a, b, c):
  """Partial forward pass that exposes the intermediate t1 = a / b.

  This is the forward counterpart of f_backward_partialt1: it returns t1
  instead of the full output, so the backward logic from dt1 to
  (da, db, dc) can be gradient-checked in isolation.

  Returns:
    (t1, cache), where cache is the tuple (a, b, c) of the inputs.
  """
  t1 = a / b
  cache = (a, b, c)
  # Return t1 (not t2 or out): this partial is meant to be checked against
  # the backward function that starts from dt1.
  return t1, cache

# Notice that we return t1 rather than out.
# You can then define a "backward" version of this partial
# function that computes da, db, and dc from dt1:

def f_backward_partialt1(dt1, cache):
  """Backward pass through t1 = a / b only.

  Args:
    dt1: upstream gradient with respect to t1.
    cache: tuple (a, b, c) of the forward inputs.

  Returns:
    (da, db, dc); dc is all zeros because t1 does not depend on c.
  """
  a, b, _ = cache
  da = dt1 / b
  db = -dt1 * a / b / b
  return da, db, np.zeros_like(da)

def f_backward_partialt2(dt2, cache):
  """Backward pass through t2 = b / c only.

  Args:
    dt2: upstream gradient with respect to t2.
    cache: tuple (a, b, c) of the forward inputs.

  Returns:
    (da, db, dc); da is all zeros because t2 does not depend on a.
  """
  a, b, c = cache
  db = dt2 / c
  dc = dt2 * (-b / c**2)
  return np.zeros_like(a), db, dc

def f_backward(dout, cache):
  """Full backward pass for out = a / b + b / c.

  Args:
    dout: upstream gradient with respect to out.
    cache: tuple (a, b, c) of the forward inputs.

  Returns:
    (da, db, dc), the gradients with respect to a, b and c.
  """
  a, b, c = cache
  # out = t1 + t2, so the sum hands dout unchanged to both intermediates.
  dt1 = dt2 = dout
  da = dt1 / b
  # b feeds both t1 = a / b and t2 = b / c; its gradient sums both paths.
  db_via_t2 = dt2 / c
  db_via_t1 = -dt1 * a / b**2
  db = db_via_t2 + db_via_t1
  dc = dt2 * (-b / c**2)
  return da, db, dc

# If these partial functions pass a numeric gradient check,
# then you know how to compute da, db, and dc from dt1.
# You can repeat the same exercise to compute da, db, and dc from dt2;
# then you just need to figure out how to compute dt1 and dt2 from dout.

# This is a simple example that you probably
# don't need this technique for, but it should help
# you see how you can use partial functions to help
# you pinpoint bugs in complex backward passes.
                    
# We start computing the gradients not from dout but from dt2 or dt1.

In [4]:
# Gradient-check f_backward against numerical gradients of f_forward.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward(x, b, c)[0]
fb = lambda x: f_forward(a, x, c)[0]
fc = lambda x: f_forward(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients from the hand-written backward pass.
_, cache = f_forward(a, b, c)
da, db, dc = f_backward(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  9.939960877105371e-12
db error:  3.1029443312963303e-09
dc error:  4.047663498413915e-08


In [5]:
# IT WORKS!!!!!

In [12]:
def f_forward2(a, b, c):
    """Forward pass of out = (a / b) * 3 + (b / c) * 5.

    The intermediates are t1 = a / b, t2 = b / c, m1 = t1 * 3 and
    m2 = t2 * 5.

    Returns:
        (out, cache), where cache is the tuple (a, b, c) of the inputs.
    """
    t1, t2 = a / b, b / c
    m1, m2 = t1 * 3, t2 * 5
    return m1 + m2, (a, b, c)

def f_forward2_partialt1(a, b, c):
    """Partial forward pass of f_forward2 that exposes t1 = a / b.

    The remaining intermediates are still computed so the body mirrors
    f_forward2; only the returned value differs.

    Returns:
        (t1, cache), where cache is the tuple (a, b, c) of the inputs.
    """
    t1, t2 = a / b, b / c
    m1, m2 = t1 * 3, t2 * 5  # unused here; kept to mirror f_forward2
    # The full function would return m1 + m2 instead.
    return t1, (a, b, c)

def f_backward2_partialt1(dt1, cache):
    """Backward pass through t1 = a / b only.

    Args:
        dt1: upstream gradient with respect to t1.
        cache: tuple (a, b, c) of the forward inputs.

    Returns:
        (da, db, dc); dc is all zeros because t1 does not depend on c.
    """
    a, b, _ = cache
    da = dt1 / b
    db = -dt1 * a / b / b
    return da, db, np.zeros_like(da)

In [13]:
# Gradient-check f_backward2_partialt1 against numerical gradients of
# f_forward2_partialt1.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward2_partialt1(x, b, c)[0]
fb = lambda x: f_forward2_partialt1(a, x, c)[0]
fc = lambda x: f_forward2_partialt1(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients; dout plays the role of the upstream gradient dt1.
_, cache = f_forward2_partialt1(a, b, c)
da, db, dc = f_backward2_partialt1(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  9.939960877105371e-12
db error:  3.228362380366263e-09
dc error:  0.0


In [17]:
def f_forward2_partialt2(a, b, c):
    """Partial forward pass of f_forward2 that exposes t2 = b / c.

    The remaining intermediates are still computed so the body mirrors
    f_forward2; only the returned value differs.

    Returns:
        (t2, cache), where cache is the tuple (a, b, c) of the inputs.
    """
    t1, t2 = a / b, b / c
    m1, m2 = t1 * 3, t2 * 5  # unused here; kept to mirror f_forward2
    # The full function would return m1 + m2 instead.
    return t2, (a, b, c)


def f_backward2_partialt2(dt2, cache):
    """Backward pass through t2 = b / c only.

    Args:
        dt2: upstream gradient with respect to t2.
        cache: tuple (a, b, c) of the forward inputs.

    Returns:
        (da, db, dc); da is all zeros because t2 does not depend on a.
    """
    a, b, c = cache
    db = dt2 / c
    dc = -dt2 * b / c / c
    return np.zeros_like(a), db, dc

In [18]:
# Gradient-check f_backward2_partialt2 against numerical gradients of
# f_forward2_partialt2.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward2_partialt2(x, b, c)[0]
fb = lambda x: f_forward2_partialt2(a, x, c)[0]
fc = lambda x: f_forward2_partialt2(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients; dout plays the role of the upstream gradient dt2.
_, cache = f_forward2_partialt2(a, b, c)
da, db, dc = f_backward2_partialt2(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  0.0
db error:  5.41936183948334e-12
dc error:  4.047663498413915e-08


In [21]:
def f_forward2_partialt3(a, b, c):
    """Partial forward pass of f_forward2 that exposes m2 = (b / c) * 5.

    Despite the "t3" suffix in the name, the value returned is the
    intermediate m2. The other intermediates are still computed so the
    body mirrors f_forward2.

    Returns:
        (m2, cache), where cache is the tuple (a, b, c) of the inputs.
    """
    t1, t2 = a / b, b / c
    m1, m2 = t1 * 3, t2 * 5  # m1 is unused here; kept to mirror f_forward2
    # The full function would return m1 + m2 instead.
    return m2, (a, b, c)


def f_backward2_partialt3(dm2, cache):
    """Backward pass from m2 = t2 * 5 (with t2 = b / c) to the inputs.

    Args:
        dm2: upstream gradient with respect to m2.
        cache: tuple (a, b, c) of the forward inputs.

    Returns:
        (da, db, dc); da is all zeros because m2 does not depend on a.
    """
    a, b, c = cache
    # m2 = t2 * 5, so the gradient reaching t2 is scaled by 5.
    dt2 = dm2 * 5
    db = dt2 / c
    dc = -dt2 * b / c / c
    return np.zeros_like(a), db, dc

In [22]:
# Gradient-check f_backward2_partialt3 against numerical gradients of
# f_forward2_partialt3.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward2_partialt3(x, b, c)[0]
fb = lambda x: f_forward2_partialt3(a, x, c)[0]
fc = lambda x: f_forward2_partialt3(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients; dout plays the role of the upstream gradient dm2.
_, cache = f_forward2_partialt3(a, b, c)
da, db, dc = f_backward2_partialt3(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  0.0
db error:  6.449223167160532e-12
dc error:  4.0476731262406694e-08


In [27]:
def f_forward2_partialt4(a, b, c):
    """Partial forward pass of f_forward2 that exposes m1 = (a / b) * 3.

    Despite the "t4" suffix in the name, the value returned is the
    intermediate m1. The other intermediates are still computed so the
    body mirrors f_forward2.

    Returns:
        (m1, cache), where cache is the tuple (a, b, c) of the inputs.
    """
    t1, t2 = a / b, b / c
    m1, m2 = t1 * 3, t2 * 5  # m2 is unused here; kept to mirror f_forward2
    # The full function would return m1 + m2 instead.
    return m1, (a, b, c)


def f_backward2_partialt4(dm1, cache):
    """Backward pass from m1 = t1 * 3 (with t1 = a / b) to the inputs.

    Args:
        dm1: upstream gradient with respect to m1.
        cache: tuple (a, b, c) of the forward inputs.

    Returns:
        (da, db, dc); dc is all zeros because m1 does not depend on c.
    """
    a, b, _ = cache
    # m1 = t1 * 3, so the gradient reaching t1 is scaled by 3.
    dt1 = dm1 * 3
    da = dt1 / b
    db = -dt1 * a / b / b
    return da, db, np.zeros_like(da)

In [28]:
# Gradient-check f_backward2_partialt4 against numerical gradients of
# f_forward2_partialt4.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward2_partialt4(x, b, c)[0]
fb = lambda x: f_forward2_partialt4(a, x, c)[0]
fc = lambda x: f_forward2_partialt4(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients; dout plays the role of the upstream gradient dm1.
_, cache = f_forward2_partialt4(a, b, c)
da, db, dc = f_backward2_partialt4(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  1.3232017618086177e-11
db error:  3.2280408425629384e-09
dc error:  0.0


In [30]:
# Final grads
def f_backward2(dout, cache):
    """Full backward pass for out = (a / b) * 3 + (b / c) * 5.

    Args:
        dout: upstream gradient with respect to out.
        cache: tuple (a, b, c) of the forward inputs.

    Returns:
        (da, db, dc), the gradients with respect to a, b and c.
    """
    a, b, c = cache
    # out = m1 + m2, so the sum hands dout unchanged to both branches.
    dm1 = dm2 = dout
    # m1 = t1 * 3 and m2 = t2 * 5 scale the gradients by 3 and 5.
    dt1, dt2 = dm1 * 3, dm2 * 5
    da = dt1 / b
    # b feeds both t1 = a / b and t2 = b / c; its gradient sums both paths.
    db_via_t1 = -dt1 * a / b / b
    db_via_t2 = dt2 / c
    db = db_via_t1 + db_via_t2
    dc = -dt2 * b / c / c
    return da, db, dc

In [31]:
# Gradient-check the full f_backward2 against numerical gradients of
# f_forward2.
np.random.seed(231)  # fixed seed so the random inputs are reproducible
N = 10
a = np.random.randn(N)
b = np.random.randn(N)
c = np.random.randn(N)
dout = np.random.randn(N)

# Each wrapper evaluates the forward pass at the array it is handed (x)
# rather than at the captured global, so the check does not rely on the
# checker mutating that global array in place.
# NOTE(review): assumes eval_numerical_gradient_array calls f on the
# perturbed input it was given -- confirm against cs231n.gradient_check.
fa = lambda x: f_forward2(x, b, c)[0]
fb = lambda x: f_forward2(a, x, c)[0]
fc = lambda x: f_forward2(a, b, x)[0]

da_num = eval_numerical_gradient_array(fa, a, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)
dc_num = eval_numerical_gradient_array(fc, c, dout)

# Analytic gradients from the hand-written backward pass.
_, cache = f_forward2(a, b, c)
da, db, dc = f_backward2(dout, cache)

print('da error: ', rel_error(da_num, da))
print('db error: ', rel_error(db_num, db))
print('dc error: ', rel_error(dc_num, dc))

da error:  1.3232017618086177e-11
db error:  3.0250542430511404e-09
dc error:  4.0476731262406694e-08
