In [1]:
import numpy as np

# Problem sizes: N samples, D input features, H output units.
N, D, H = 100, 10, 20

# Random operands of the affine map c = x.W + b (b has one row per sample).
x = np.random.randn(N, D)
W = np.random.randn(D, H)
b = np.random.randn(N, H)

# Forward pass: matrix product, additive term, then reduce to a scalar.
a = np.dot(x, W)
c = np.add(a, b)
l = c.sum()

# Backward pass, derived by hand.
# d(l)/d(l) is 1, and summing passes that value unchanged to every entry of c.
grad_l = 1.
grad_c = np.full((N, H), grad_l)
# Addition forwards the upstream gradient to both operands; each gets its own copy.
grad_a = np.array(grad_c)
grad_b = np.array(grad_c)
# Matrix-product rule: d/dx = grad_a . W^T and d/dW = x^T . grad_a.
grad_x = np.dot(grad_a, W.T)
grad_W = np.dot(x.T, grad_a)

In [2]:
# Notebook-wide switch read by the TensorFlow and PyTorch cells:
# True selects "/gpu:0" and moves the torch tensors with .cuda(); False keeps both on the CPU.
use_cuda = True

In [3]:
import tensorflow as tf

# Translate the notebook-wide CUDA switch into a TensorFlow device string.
if use_cuda:
    device = "/gpu:0"
else:
    device = "/cpu:0"

# Build the forward graph l = sum(x.W + b) under the requested device scope.
with tf.device(device):
    # The generator is consumed by the unpacking, i.e. still inside the device scope.
    tf_tensor_x, tf_tensor_W, tf_tensor_b = (
        tf.convert_to_tensor(array) for array in (x, W, b)
    )

    tf_tensor_a = tf.matmul(tf_tensor_x, tf_tensor_W)
    tf_tensor_c = tf.add(tf_tensor_a, tf_tensor_b)
    tf_tensor_l = tf.reduce_sum(tf_tensor_c)

# Symbolic gradients of the scalar l with respect to each of the three inputs.
tf_tensor_grad_x, tf_tensor_grad_W, tf_tensor_grad_b = tf.gradients(
    tf_tensor_l, [tf_tensor_x, tf_tensor_W, tf_tensor_b]
)

# Evaluate all three gradient tensors in a single run; the session is closed on exit.
with tf.Session() as sess:
    result = sess.run([tf_tensor_grad_x, tf_tensor_grad_W, tf_tensor_grad_b])
tf_val_grad_x, tf_val_grad_W, tf_val_grad_b = result

  return f(*args, **kwds)


In [4]:
import torch
from torch.autograd import Variable

def _leaf_variable(array):
    """Wrap a NumPy array in a gradient-tracking Variable.

    The tensor is moved with ``.cuda()`` first when ``use_cuda`` is true;
    otherwise it stays where ``torch.from_numpy`` created it.
    """
    tensor = torch.from_numpy(array)
    if use_cuda:
        tensor = tensor.cuda()
    return Variable(tensor, requires_grad=True)


torch_variable_x = _leaf_variable(x)
torch_variable_W = _leaf_variable(W)
torch_variable_b = _leaf_variable(b)

# Forward pass: l = sum(x.W + b).
torch_variable_a = torch_variable_x.matmul(torch_variable_W)
torch_variable_c = torch_variable_a.add(torch_variable_b)
torch_variable_l = torch_variable_c.sum()

# Autograd fills in .grad on each of the three leaf Variables.
torch_variable_l.backward()

# Pull the raw gradient tensors out of the Variables.
torch_val_grad_x, torch_val_grad_W, torch_val_grad_b = (
    variable.grad.data
    for variable in (torch_variable_x, torch_variable_W, torch_variable_b)
)

In [5]:
# Report how far each framework's gradients are from the hand-derived NumPy
# gradients; a Frobenius norm of 0.0 means the two agree exactly.
print("diff of grad x(numpy vs tf): {}".format(np.linalg.norm(grad_x - tf_val_grad_x)))
print("diff of grad W(numpy vs tf): {}".format(np.linalg.norm(grad_W - tf_val_grad_W)))
print("diff of grad b(numpy vs tf): {}".format(np.linalg.norm(grad_b - tf_val_grad_b)))
# The torch gradients are copied to host memory and converted to NumPy arrays
# explicitly, rather than relying on implicit ndarray/tensor arithmetic.
# Each label names the gradient that is actually being compared (x, W, b).
print("diff of grad x(numpy vs torch): {}".format(
    np.linalg.norm(grad_x - torch_val_grad_x.cpu().numpy())))
print("diff of grad W(numpy vs torch): {}".format(
    np.linalg.norm(grad_W - torch_val_grad_W.cpu().numpy())))
print("diff of grad b(numpy vs torch): {}".format(
    np.linalg.norm(grad_b - torch_val_grad_b.cpu().numpy())))

diff of grad x(numpy vs tf): 0.0
diff of grad W(numpy vs tf): 0.0
diff of grad b(numpy vs tf): 0.0
diff of grad x(numpy vs torch): 0.0
diff of grad x(numpy vs torch): 0.0
diff of grad x(numpy vs torch): 0.0
