In [2]:
import numpy as np
import tensorflow as tf

# 07-03: Convolution Operation Naive Backward Pass

Compute each backward pass by hand with NumPy, then check the result against TensorFlow and a numerical gradient.

#### 1. Basic
$(4 \times 4) * (3 \times 3) = (2 \times 2)$

#### 2. Padding
$(4 \times 4) * (3 \times 3) = (4 \times 4)$ where $P=1$

#### 3. Stride
$(7 \times 7) * (3 \times 3) = (3 \times 3)$ where $S=2$

#### 4. Padding and Stride
$(7 \times 7) * (3 \times 3) = (4 \times 4)$ where $P=1, S=2$

#### 5. Channel
$(4 \times 4 \times 3) * (3 \times 3 \times 3) = (2 \times 2)$

#### 6. Channel and bias 
$(4 \times 4 \times 3) * (3 \times 3 \times 3) + (1) = (2 \times 2)$

#### 7. Multiple Filters
$(4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) = (2 \times 2 \times 4)$

#### 8. Multiple Filters + bias
$(4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (2 \times 2 \times 4)$

#### 9. Mini-batch + bias
$(3 \times 4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (3 \times 2 \times 2 \times 4)$

#### 10. RGB Mini-batch $*$ Multiple Filters with stride and padding
$(3 \times 7 \times 7 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (3 \times 4 \times 4 \times 4)$ where $P=1, S=2$


In [3]:
def float_sequence(size):
    """Return the 1-D float32 array [0, 1, ..., size - 1].

    Used throughout the notebook to build deterministic inputs and filters
    that are then reshaped to the required dimensions.
    """
    first, step = 0, 1
    sequence = np.arange(first, size, step, dtype=np.float32)
    return sequence

### 1. Basic Convolution Backward

$(4 \times 4) * (3 \times 3) = (2 \times 2)$

In [29]:
# Naive backward pass of a single-channel 3x3 convolution (no padding,
# stride 1) over a 4x4 input, verified against TensorFlow's gradients.
X = float_sequence(4*4).reshape(4,4)
W = 12 - float_sequence(3*3).reshape(3,3)

# Upstream gradient. The TensorFlow loss below is L = sum(Y), so dL/dY is 1
# for every element of the 2x2 output.
dY = np.ones((2,2))

print("=== dY ===")
print(dY)

dX = np.zeros((4,4))
dW = np.zeros((3,3))

for h in range(4-3+1):
    for w in range(4-3+1):
        # 3x3 input window that produced output element (h, w).
        h_start = h
        h_end   = h_start + 3
        w_start = w
        w_end   = w_start + 3

        current_dY = dY[h, w]
        # Each input in the window was multiplied by the matching weight,
        # so it receives dY * W; overlapping windows accumulate.
        dX[h_start:h_end, w_start:w_end] += current_dY * W
        # Each weight was multiplied by the matching input in the window,
        # so it receives dY * (window of X), summed over all windows.
        dW += current_dY * X[h_start:h_end, w_start:w_end]

print("=== dX ===")
print(dX)

print("=== dW ===")
print(dW)

with tf.Session() as sess:
    # TensorFlow expects NHWC inputs and HWIO filters.
    tf_X = tf.constant(X.reshape(1, 4, 4, 1))
    tf_W = tf.Variable(W.reshape(3, 3, 1, 1))
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, 1, 1, 1], padding='VALID')
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
    print("=== L (tf) ===")
    print(tf_L_val)
    print("=== dX (tf) ===")
    print(tf_grad_val[0][0, :, :, 0])
    print("=== dW (tf) ===")
    print(tf_grad_val[1][:, :, 0, 0])

# Compare the hand-computed gradients with TensorFlow's.
print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, 0]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, 0, 0]))

=== dY ===
[[ 1.  1.]
 [ 1.  1.]]
=== dX ===
[[ 12.  23.  21.  10.]
 [ 21.  40.  36.  17.]
 [ 15.  28.  24.  11.]
 [  6.  11.   9.   4.]]
=== dW ===
[[ 10.  14.  18.]
 [ 26.  30.  34.]
 [ 42.  46.  50.]]
=== L (tf) ===
1848.0
=== dX (tf) ===
[[ 12.  23.  21.  10.]
 [ 21.  40.  36.  17.]
 [ 15.  28.  24.  11.]
 [  6.  11.   9.   4.]]
=== dW (tf) ===
[[ 10.  14.  18.]
 [ 26.  30.  34.]
 [ 42.  46.  50.]]
=== Matched? ===
dX:  True
dY:  True


### 2. Convolution with padding

$(4 \times 4) * (3 \times 3) = (4 \times 4)$ where $P=1$

In [38]:
# Naive backward pass of a 3x3 convolution over a 4x4 input zero-padded by
# P=1 on every side (stride 1), verified against TensorFlow's 'SAME' padding.
X_org = float_sequence(4*4).reshape(4,4)
P = 1
# 'constant' mode pads with zeros by default, giving a 6x6 array.
X = np.pad(X_org, ((P, P), (P, P)), 'constant')
W = 12 - float_sequence(3*3).reshape(3,3)

# Upstream gradient of L = sum(Y): 1 for every element of the 4x4 output.
dY = np.ones((4,4))

# dX is accumulated on the padded 6x6 grid and cropped afterwards.
dX = np.zeros((6,6))
dW = np.zeros((3,3))

for h in range(4):
    for w in range(4):
        # 3x3 window of the padded input that produced output (h, w).
        h_start = h
        h_end   = h_start + 3
        w_start = w
        w_end   = w_start + 3

        current_dY = dY[h, w]
        dX[h_start:h_end, w_start:w_end] += current_dY * W
        dW += current_dY * X[h_start:h_end, w_start:w_end]

# Gradient that landed on the padding border belongs to no real input
# element, so drop it to recover the 4x4 gradient of X_org.
dX = dX[P:-P, P:-P] # unpad
print("=== dX ===")
print(dX)

print("=== dW ===")
print(dW)

with tf.Session() as sess:
    # TensorFlow receives the unpadded input and applies the padding itself.
    tf_X = tf.constant(X_org.reshape(1, 4, 4, 1))
    tf_W = tf.Variable(W.reshape(3, 3, 1, 1))
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, 1, 1, 1], padding='SAME')
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
    print("=== dX (tf) ===")
    print(tf_grad_val[0][0, :, :, 0])
    print("=== dW (tf) ===")
    print(tf_grad_val[1][:, :, 0, 0])

# Compare the hand-computed gradients with TensorFlow's.
print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, 0]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, 0, 0]))

=== dX ===
[[ 40.  57.  57.  36.]
 [ 51.  72.  72.  45.]
 [ 51.  72.  72.  45.]
 [ 28.  39.  39.  24.]]
=== dW ===
[[  45.   66.   54.]
 [  84.  120.   96.]
 [  81.  114.   90.]]
=== dX (tf) ===
[[ 40.  57.  57.  36.]
 [ 51.  72.  72.  45.]
 [ 51.  72.  72.  45.]
 [ 28.  39.  39.  24.]]
=== dW (tf) ===
[[  45.   66.   54.]
 [  84.  120.   96.]
 [  81.  114.   90.]]
=== Matched? ===
dX:  True
dY:  True


### 3. Convolution with Stride

$(7 \times 7) * (3 \times 3) = (3 \times 3)$ where $S=2$

In [43]:
# Naive backward pass of a 3x3 convolution with stride S=2 over a 7x7 input
# (no padding), verified against TensorFlow's gradients.
X = float_sequence(7*7).reshape(7,7)
W = 12 - float_sequence(3*3).reshape(3,3)
S = 2

# Upstream gradient of L = sum(Y): 1 for every element of the 3x3 output.
dY = np.ones((3,3))

dX = np.zeros((7,7))
dW = np.zeros((3,3))

for h in range(3):
    for w in range(3):
        # With stride S, output (h, w) reads the 3x3 window whose top-left
        # corner is at (h*S, w*S) in the input.
        h_start = h * S
        h_end   = h_start + 3
        w_start = w * S
        w_end   = w_start + 3

        current_dY = dY[h, w]
        dX[h_start:h_end, w_start:w_end] += current_dY * W
        dW += current_dY * X[h_start:h_end, w_start:w_end]

print("=== dX ===")
print(dX)

print("=== dW ===")
print(dW)

with tf.Session() as sess:
    # TensorFlow expects NHWC inputs and HWIO filters.
    tf_X = tf.constant(X.reshape(1, 7, 7, 1))
    tf_W = tf.Variable(W.reshape(3, 3, 1, 1))
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, S, S, 1], padding='VALID')
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
    print("=== L (tf) ===")
    print(tf_L_val)
    print("=== dX (tf) ===")
    print(tf_grad_val[0][0, :, :, 0])
    print("=== dW (tf) ===")
    print(tf_grad_val[1][:, :, 0, 0])

# Compare the hand-computed gradients with TensorFlow's.
print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, 0]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, 0, 0]))

=== dX ===
[[ 12.  11.  22.  11.  22.  11.  10.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [ 18.  16.  32.  16.  32.  16.  14.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [ 18.  16.  32.  16.  32.  16.  14.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [  6.   5.  10.   5.  10.   5.   4.]]
=== dW ===
[[ 144.  153.  162.]
 [ 207.  216.  225.]
 [ 270.  279.  288.]]
=== L (tf) ===
14364.0
=== dX (tf) ===
[[ 12.  11.  22.  11.  22.  11.  10.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [ 18.  16.  32.  16.  32.  16.  14.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [ 18.  16.  32.  16.  32.  16.  14.]
 [  9.   8.  16.   8.  16.   8.   7.]
 [  6.   5.  10.   5.  10.   5.   4.]]
=== dW (tf) ===
[[ 144.  153.  162.]
 [ 207.  216.  225.]
 [ 270.  279.  288.]]
=== Matched? ===
dX:  True
dY:  True


### 4. Padding and Stride

$(7 \times 7) * (3 \times 3) = (4 \times 4)$ where $P=1, S=2$

In [52]:
# Naive backward pass of a 3x3 convolution with padding P=1 and stride S=2
# over a 7x7 input, verified against TensorFlow's 'SAME' padding.
P = 1
S = 2
X_org = float_sequence(7*7).reshape(7,7)
# 'constant' mode pads with zeros by default, giving a 9x9 array.
X = np.pad(X_org, ((P, P), (P, P)), 'constant')
W = 12 - float_sequence(3*3).reshape(3,3)

# print("=== X ===")
# print(X)
# print("=== W ===")
# print(W)

# Upstream gradient of L = sum(Y): 1 for every element of the 4x4 output.
dY = np.ones((4,4))

# dX is accumulated on the padded 9x9 grid and cropped afterwards.
dX = np.zeros((9,9))
dW = np.zeros((3,3))

for h in range(4):
    for w in range(4):
        # With stride S, output (h, w) reads the 3x3 window of the padded
        # input whose top-left corner is at (h*S, w*S).
        h_start = h * S
        h_end   = h_start + 3
        w_start = w * S
        w_end   = w_start + 3

        current_dY = dY[h, w]
        dX[h_start:h_end, w_start:w_end] += current_dY * W
        dW += current_dY * X[h_start:h_end, w_start:w_end]

# Drop the gradient that landed on the padding border to recover the 7x7
# gradient of X_org.
dX = dX[P:-P, P:-P] # unpad
print("=== dX ===")
print(dX)

print("=== dW ===")
print(dW)

with tf.Session() as sess:
    # TensorFlow receives the unpadded input and applies the padding itself.
    tf_X = tf.constant(X_org.reshape(1, 7, 7, 1))
    tf_W = tf.Variable(W.reshape(3, 3, 1, 1))
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, S, S, 1], padding='SAME')
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
    print("=== L (tf) ===")
    print(tf_L_val)
    print("=== dX (tf) ===")
    print(tf_grad_val[0][0, :, :, 0])
    print("=== dW (tf) ===")
    print(tf_grad_val[1][:, :, 0, 0])

# Compare the hand-computed gradients with TensorFlow's.
# print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, 0]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, 0, 0]))

=== dX ===
[[  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]]
=== dW ===
[[ 216.  288.  216.]
 [ 288.  384.  288.]
 [ 216.  288.  216.]]
=== L (tf) ===
19200.0
=== dX (tf) ===
[[  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]
 [ 16.  32.  16.  32.  16.  32.  16.]
 [  8.  16.   8.  16.   8.  16.   8.]]
=== dW (tf) ===
[[ 216.  288.  216.]
 [ 288.  384.  288.]
 [ 216.  288.  216.]]
dX:  True
dY:  True


### 5. Channel

$(4 \times 4 \times 3) * (3 \times 3 \times 3) = (2 \times 2)$

In [67]:
# Naive backward pass of a 3x3x3 filter over a 4x4x3 (HWC) input, no padding,
# stride 1, verified against TensorFlow's gradients.
X = float_sequence(4*4*3).reshape(4,4,3)
W = 30 - float_sequence(3*3*3).reshape(3,3,3)
# print("=== X ===")
# print(X.transpose(2, 0, 1))
# print("=== W ===")
# print(W.transpose(2, 0, 1))

# Upstream gradient of L = sum(Y): 1 for every element of the 2x2 output.
# The output has no channel axis because the single filter spans all input
# channels.
dY = np.ones((2,2))

# print("=== dY ===")
# print(dY)

dX = np.zeros((4,4,3))
dW = np.zeros((3,3,3))

for h in range(4-3+1):
    for w in range(4-3+1):
        # 3x3 spatial window (all channels) that produced output (h, w).
        h_start = h
        h_end   = h_start + 3
        w_start = w
        w_end   = w_start + 3

        current_dY = dY[h, w]
        # The scalar dY[h, w] is spread over the whole 3x3x3 window.
        dX[h_start:h_end, w_start:w_end, :] += current_dY * W
        dW += current_dY * X[h_start:h_end, w_start:w_end, :]

# print("=== dX ===")
# print(dX)

# print("=== dW ===")
# print(dW)

with tf.Session() as sess:
    # NHWC input with 3 channels; HWIO filter with 3 in- and 1 out-channel.
    tf_X = tf.constant(X.reshape(1, 4, 4, 3))
    tf_W = tf.Variable(W.reshape(3, 3, 3, 1))
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, 1, 1, 1], padding='VALID')
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
    print("=== L (tf) ===")
    print(tf_L_val)
    print("=== dX (tf) ===")
    print(tf_grad_val[0][0, :, :, :])
#     print("=== dW (tf) ===")
#     print(tf_grad_val[1][:, :, :, 0])

# Compare the hand-computed gradients with TensorFlow's.
print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, :]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, :, 0]))

=== L (tf) ===
34650.0
=== dX (tf) ===
[[[ 30.  29.  28.]
  [ 57.  55.  53.]
  [ 51.  49.  47.]
  [ 24.  23.  22.]]

 [[ 51.  49.  47.]
  [ 96.  92.  88.]
  [ 84.  80.  76.]
  [ 39.  37.  35.]]

 [[ 33.  31.  29.]
  [ 60.  56.  52.]
  [ 48.  44.  40.]
  [ 21.  19.  17.]]

 [[ 12.  11.  10.]
  [ 21.  19.  17.]
  [ 15.  13.  11.]
  [  6.   5.   4.]]]
=== Matched? ===
dX:  True
dY:  True


### 6. Channel and bias 

$(4 \times 4 \times 3) * (3 \times 3 \times 3) + (1) = (2 \times 2)$

In [80]:
# Naive backward pass of a 3x3x3 filter plus a scalar bias over a 4x4x3 (HWC)
# input, no padding, stride 1, verified against TensorFlow's gradients.
X = float_sequence(4*4*3).reshape(4,4,3)
W = 30 - float_sequence(3*3*3).reshape(3,3,3)
b = np.array([10], dtype=np.float32)
# print("=== X ===")
# print(X.transpose(2, 0, 1))
# print("=== W ===")
# print(W.transpose(2, 0, 1))

# Upstream gradient of L = sum(Y): 1 for every element of the 2x2 output.
dY = np.ones((2,2))

# print("=== dY ===")
# print(dY)


# The same bias is added to every output element, so its gradient is the sum
# of the upstream gradient. keepdims=True makes the result shape (1, 1).
db = np.sum(dY, keepdims=True)
dX = np.zeros((4,4,3))
dW = np.zeros((3,3,3))

for h in range(4-3+1):
    for w in range(4-3+1):
        # 3x3 spatial window (all channels) that produced output (h, w).
        h_start = h
        h_end   = h_start + 3
        w_start = w
        w_end   = w_start + 3

        current_dY = dY[h, w]
        dX[h_start:h_end, w_start:w_end, :] += current_dY * W
        dW += current_dY * X[h_start:h_end, w_start:w_end, :]

# print("=== dX ===")
# print(dX)

# print("=== dW ===")
# print(dW)

print("=== db ===")
print(db)

with tf.Session() as sess:
    # NHWC input with 3 channels; HWIO filter with 3 in- and 1 out-channel.
    tf_X = tf.constant(X.reshape(1, 4, 4, 3))
    tf_W = tf.Variable(W.reshape(3, 3, 3, 1))
    tf_b = tf.constant(b)
    tf_Y = tf.nn.conv2d(tf_X, tf_W, strides=[1, 1, 1, 1], padding='VALID') + tf_b
    tf_L = tf.reduce_sum(tf_Y)
    tf_grad = tf.gradients(tf_L, [tf_X, tf_W, tf_b])

    sess.run(tf.global_variables_initializer())
    tf_L_val = sess.run(tf_L)
    tf_grad_val = sess.run(tf_grad)
#     print("=== L (tf) ===")
#     print(tf_L_val)
#     print("=== dX (tf) ===")
#     print(tf_grad_val[0][0, :, :, :])
#     print("=== dW (tf) ===")
#     print(tf_grad_val[1][:, :, :, 0])
    print("=== db (tf) ===")
    print(tf_grad_val[2])

# Compare the hand-computed gradients with TensorFlow's.
print("=== Matched? ===")
print("dX: ", np.all(dX == tf_grad_val[0][0, :, :, :]))
# This line compares the filter gradient, so it is labelled dW (was "dY").
print("dW: ", np.all(dW == tf_grad_val[1][:, :, :, 0]))
# db has shape (1, 1) and TensorFlow's gradient shape (1,); the comparison
# broadcasts, so the bias gradient can be verified as well.
print("db: ", np.all(db == tf_grad_val[2]))

=== db ===
[[ 4.]]
=== db (tf) ===
[ 4.]
=== Matched? ===
dX:  True
dY:  True


### 7. Multiple Filters

$(4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) = (2 \times 2 \times 4)$

### 8. Multiple Filters + bias 

$(4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (2 \times 2 \times 4)$

### 9. Mini-batch + bias

$(3 \times 4 \times 4 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (3 \times 2 \times 2 \times 4)$

### 10. RGB Mini-batch $*$ Multiple Filters with stride and padding
$(3 \times 7 \times 7 \times 3) * (3 \times 3 \times 3 \times 4) + (4)= (3 \times 4 \times 4 \times 4)$ where $P=1, S=2$