### Initialization

In [1]:
# For Colab only!

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [1]:
import tensorflow as tf

In [2]:
import torch
from torch.nn import functional as F

In [3]:
print(tf.__version__)
print(tf.test.is_gpu_available())

2.1.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [4]:
print(torch.__version__)
print(torch.cuda.is_available())

1.4.0
True


### MSE gradient

In [39]:
def one_hot(label, depth):
    out = torch.zeros(label.size(0), depth)
    idx = torch.LongTensor(label).view(-1, 1)
    out.scatter_(dim=1, index=idx, value=1)
    return out

In [12]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = tf.random.uniform([3,4])
w = tf.random.uniform([4,2])
b = tf.zeros([2])
y = tf.constant([0, 1, 1])

with tf.GradientTape() as tape:
    # if the tensors are not variables
    tape.watch([w,b])
    
    logits = x @ w + b
    probs = tf.nn.softmax(logits)
    
    y_true = tf.one_hot(y, depth=2)
    
    losses = tf.losses.MSE(y_true,probs)
    loss = tf.reduce_mean(losses)
    
grads = tape.gradient(loss, [w,b])

grads_w = grads[0]
grads_b = grads[1]

print(loss)
print(grads[0])
print(grads[1])

tf.Tensor(0.23290308, shape=(), dtype=float32)
tf.Tensor(
[[ 0.00120118 -0.00120119]
 [ 0.01929211 -0.01929212]
 [ 0.03523264 -0.03523265]
 [ 0.04114018 -0.04114018]], shape=(4, 2), dtype=float32)
tf.Tensor([ 0.07563752 -0.07563753], shape=(2,), dtype=float32)


In [40]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = torch.rand(3,4)
w = torch.rand([4,2], requires_grad=True)
b = torch.zeros([2], requires_grad=True)
y = torch.LongTensor([0, 1, 1])

# if "requires_grad=Flase"
# w.requires_grad_()
# b.requires_grad_()

logits = x @ w +b
probs = F.softmax(logits, dim = 1)

y_true = one_hot(y, depth=2)
loss = F.mse_loss(y_true, probs)



grads = torch.autograd.grad(loss, [w, b])

grads_w = grads[0]
grads_b = grads[1]


print(loss)
print(grads_w)
print(grads_b)

# Alternative way:

# loss.backward()
# print(w.grad)
# print(b.grad)

tensor(0.2377, grad_fn=<MeanBackward0>)
tensor([[-0.0021,  0.0021],
        [ 0.0981, -0.0981],
        [ 0.0548, -0.0548],
        [ 0.0253, -0.0253]])
tensor([ 0.0360, -0.0360])


### Sorftmax

In [27]:
logits = tf.random.uniform([3,3])
logits = tf.Variable(logits)

with tf.GradientTape() as tape:
#     tape.watch([logits])
    
    probs = tf.nn.softmax(logits, axis=1)
    
grads = tape.gradient(probs, logits)


# print(logits)
# print(probs)

print(probs[0][0])
print(grads)



tf.Tensor(0.29163083, shape=(), dtype=float32)
tf.Tensor(
[[1.7382552e-08 1.6526526e-08 2.5695563e-08]
 [1.5146206e-08 1.5726819e-08 2.8731616e-08]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00]], shape=(3, 3), dtype=float32)


In [29]:
# logist: [b, 3], probs: [b, 3]
logits = torch.rand(3,3)
logits.requires_grad_()

probs = F.softmax(logits, dim = 1)

print(logits)
print(probs)


probs[0][0].backward(retain_graph=True)
print(logits.grad)

# probs[1][1].backward(retain_graph=True)
# print(logits.grad)

# probs[2][2].backward(retain_graph=True)
# print(logits.grad)

# probs[0][1].backward(retain_graph=True)
# print(logits.grad)



# grad_0_0 = torch.autograd.grad(probs[0][0], logits, retain_graph=True)
# print(grad_0_0)

# grad_1_1 = torch.autograd.grad(probs[1][1], logits, retain_graph=True)
# print(grad_1_1)

tensor([[0.3427, 0.7886, 0.0762],
        [0.9033, 0.4702, 0.1778],
        [0.4810, 0.8037, 0.6519]], requires_grad=True)
tensor([[0.3005, 0.4693, 0.2302],
        [0.4689, 0.3041, 0.2270],
        [0.2803, 0.3871, 0.3326]], grad_fn=<SoftmaxBackward>)
tensor([[ 0.2102, -0.1410, -0.0692],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000]])


### Crossentropy gradient

In [41]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = tf.random.uniform([3,4])
w = tf.random.uniform([4,2])
b = tf.zeros([2])
y = tf.constant([0, 1, 1])

with tf.GradientTape() as tape:
    tape.watch([w,b])
    y_true = tf.one_hot(y, depth=2)
    logits = x@w + b
    losses = tf.losses.categorical_crossentropy(y_true, 
                                                logits, 
                                                from_logits=True)
    loss = tf.reduce_mean(losses)
    
grads = tape.gradient(loss, [w,b])  

grad_w = grads[0]
grad_b = grads[1]

print(loss)
print(grad_w)
print(grad_b)

tf.Tensor(0.6436507, shape=(), dtype=float32)
tf.Tensor(
[[ 0.12311882 -0.12311883]
 [ 0.11273158 -0.11273161]
 [ 0.17509589 -0.1750959 ]
 [ 0.17360634 -0.17360635]], shape=(4, 2), dtype=float32)
tf.Tensor([ 0.10200089 -0.10200092], shape=(2,), dtype=float32)


In [49]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot not requried here

x = torch.rand(3,4)
w = torch.rand(4,2, requires_grad=True)
b = torch.zeros(2, requires_grad=True)
y = torch.LongTensor([0, 1, 1])

logits = x@w + b
# must use logits rather than probs, one-hot encoding not required
loss = F.cross_entropy(logits, y)

loss.bac()

print(loss)
print(w.grad)
print(b.grad)

AttributeError: 'Tensor' object has no attribute 'backword'