### Initialization

In [1]:
# For Colab only!

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [3]:
import tensorflow as tf

In [4]:
import torch
from torch.nn import functional as F

In [3]:
print(tf.__version__)
print(tf.test.is_gpu_available())

2.1.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [4]:
print(torch.__version__)
print(torch.cuda.is_available())

1.4.0
True


### Simple Gradient

In [22]:
def func(x):
    return x**2 + x**3 + 5

# df(x)/dx = 2x + 3x**2

In [25]:
x = tf.Variable([2.])

with tf.GradientTape() as tape:
    y = func(x)
    
grad = tape.gradient(y, [x])
print(grad[0])

tf.Tensor([16.], shape=(1,), dtype=float32)


In [32]:
x = torch.tensor([2.], requires_grad = True)

y = func(x)

grad = torch.autograd.grad(y, [x])
print(grad[0])

# y.backward()
# print(x.grad)

tensor([16.])


### MSE gradient

In [39]:
def one_hot(label, depth):
    out = torch.zeros(label.size(0), depth)
    idx = torch.LongTensor(label).view(-1, 1)
    out.scatter_(dim=1, index=idx, value=1)
    return out

In [12]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x@w +b  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = tf.random.uniform([3,4])
w = tf.random.uniform([4,2])
b = tf.zeros([2])
y = tf.constant([0, 1, 1])

with tf.GradientTape() as tape:
    # if the tensors are not variables
    tape.watch([w,b])
    
    logits = x @ w + b
    probs = tf.nn.softmax(logits)
    
    y_true = tf.one_hot(y, depth=2)
    
    losses = tf.losses.MSE(y_true,probs)
    loss = tf.reduce_mean(losses)
    
grads = tape.gradient(loss, [w,b])

grads_w = grads[0]
grads_b = grads[1]

print(loss)
print(grads[0])
print(grads[1])

tf.Tensor(0.23290308, shape=(), dtype=float32)
tf.Tensor(
[[ 0.00120118 -0.00120119]
 [ 0.01929211 -0.01929212]
 [ 0.03523264 -0.03523265]
 [ 0.04114018 -0.04114018]], shape=(4, 2), dtype=float32)
tf.Tensor([ 0.07563752 -0.07563753], shape=(2,), dtype=float32)


In [40]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x@w +b  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = torch.rand(3,4)
w = torch.rand([4,2], requires_grad=True)
b = torch.zeros([2], requires_grad=True)
y = torch.LongTensor([0, 1, 1])

# if "requires_grad=Flase"
# w.requires_grad_()
# b.requires_grad_()

logits = x @ w +b
probs = F.softmax(logits, dim = 1)

y_true = one_hot(y, depth=2)
loss = F.mse_loss(y_true, probs)



grads = torch.autograd.grad(loss, [w, b])

grads_w = grads[0]
grads_b = grads[1]


print(loss)
print(grads_w)
print(grads_b)

# Alternative way:

# loss.backward()
# print(w.grad)
# print(b.grad)

tensor(0.2377, grad_fn=<MeanBackward0>)
tensor([[-0.0021,  0.0021],
        [ 0.0981, -0.0981],
        [ 0.0548, -0.0548],
        [ 0.0253, -0.0253]])
tensor([ 0.0360, -0.0360])


### Sorftmax

In [75]:
logits = tf.random.uniform([3,3])
logits = tf.Variable(logits)

with tf.GradientTape() as tape:
#     tape.watch([logits])
    
    probs = tf.nn.softmax(logits, axis=1)
    
grads = tape.gradient(probs, [logits])


# print(logits)
# print(probs)

# print(probs[0][0])
print(grads)



[<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [1.2822345e-08, 2.3484551e-08, 2.3297744e-08],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00]], dtype=float32)>]


In [68]:
# logist: [b, 3], probs: [b, 3]
logits = torch.rand(3,3)
logits.requires_grad_()

probs = F.softmax(logits, dim = 1)

print(logits)
print(probs)

grad_0_0 = torch.autograd.grad(probs[0][0], logits, retain_graph=True)
print(grad_0_0)

# probs[0][0].backward(retain_graph=True)
# print(logits.grad)

# probs[1][1].backward(retain_graph=True)
# print(logits.grad)

# probs[2][2].backward(retain_graph=True)
# print(logits.grad)

# probs[0][1].backward(retain_graph=True)
# print(logits.grad)


tensor([[0.0512, 0.2323, 0.1603],
        [0.4415, 0.2883, 0.0061],
        [0.3152, 0.3054, 0.9709]], requires_grad=True)
tensor([[0.3018, 0.3617, 0.3366],
        [0.3992, 0.3425, 0.2583],
        [0.2553, 0.2528, 0.4919]], grad_fn=<SoftmaxBackward>)
(tensor([[ 0.2107, -0.1091, -0.1016],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000]]),)


### Crossentropy gradient

In [69]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = tf.random.uniform([3,4])
w = tf.random.uniform([4,2])
b = tf.zeros([2])
y = tf.constant([0, 1, 1])

with tf.GradientTape() as tape:
    tape.watch([w,b])
    y_true = tf.one_hot(y, depth=2)
    logits = x@w + b
    losses = tf.losses.categorical_crossentropy(y_true, 
                                                logits, 
                                                from_logits=True)
    loss = tf.reduce_mean(losses)
    
grads = tape.gradient(loss, [w,b])  

grad_w = grads[0]
grad_b = grads[1]

print(loss)
print(grad_w)
print(grad_b)

tf.Tensor(0.7021515, shape=(), dtype=float32)
tf.Tensor(
[[-0.12753755  0.12753753]
 [ 0.11524016 -0.11524017]
 [-0.03575598  0.03575598]
 [ 0.02847505 -0.02847505]], shape=(4, 2), dtype=float32)
tf.Tensor([ 0.10340744 -0.10340746], shape=(2,), dtype=float32)


In [4]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot not requried here

x = torch.rand(3,4)
w = torch.rand(4,2, requires_grad=True)
b = torch.zeros(2, requires_grad=True)
y = torch.LongTensor([0, 1, 1])

logits = x@w + b
# must use logits rather than probs, one-hot encoding not required
loss = F.cross_entropy(logits, y)

loss.backward()

print(loss)
print(w.grad)
print(b.grad)

tensor(0.6820, grad_fn=<NllLossBackward>)
tensor([[ 0.0685, -0.0685],
        [ 0.0468, -0.0468],
        [ 0.0173, -0.0173],
        [ 0.0620, -0.0620]])
tensor([ 0.1484, -0.1484])


### Chain Rule

In [93]:
x1 = tf.random.uniform([1])
w1 = tf.random.uniform([1])
b1 = tf.random.uniform([1])

w2 = tf.random.uniform([1])
b2 = tf.random.uniform([1])

with tf.GradientTape(persistent=True) as tape:
    tape.watch([w1,b1,w2,b2])
    
    y1 = x1*w1 + b1
    y2 = y1*w2 + b2
    
[dy1_dw1] = tape.gradient(y1, [w1])
[dy2_dy1] = tape.gradient(y2, [y1])
     

[dy2_dw1] = tape.gradient(y2, [w1])
print(dy2_dw1)
print(dy2_dw1 == dy2_dy1 * dy1_dw1) 

tf.Tensor([0.20194398], shape=(1,), dtype=float32)
tf.Tensor([ True], shape=(1,), dtype=bool)


In [94]:
x1 = torch.rand(1)
w1 = torch.rand(1, requires_grad=True)
b1 = torch.rand(1, requires_grad=True)

w2 = torch.rand(1, requires_grad=True)
b2 = torch.rand(1, requires_grad=True)

y1 = x1*w1 + b1
y2 = y1*w2 + b2

(dy1_dw1,) = torch.autograd.grad(y1, w1, retain_graph=True)

(dy2_dy1,) = torch.autograd.grad(y2, y1, retain_graph=True)


(dy2_dw1,) = torch.autograd.grad(y2, w1)
print(dy2_dw1)
print(dy2_dy1 * dy1_dw1 == dy2_dw1)


tensor([0.6876])
tensor([True])


In [120]:
x1 = tf.random.uniform([3,4])*10
w1 = tf.random.uniform([4,3])*10
b1 = tf.zeros([3])

w2 = tf.random.uniform([3,2])*10
b2 = tf.zeros([2])

with tf.GradientTape(persistent=True) as tape:
    tape.watch([w1,b1,w2,b2])
    
    y1 = tf.nn.relu(x1@w1 + b1)
    y2 = tf.nn.relu(y1@w2 + b2)
    
[dy1_dw1] = tape.gradient(y1, [w1])
[dy2_dy1] = tape.gradient(y2, [y1])
     
print(dy1_dw1)
print(dy2_dy1)

print(dy2_dy1[0][0] * dy1_dw1[0][0])

[dy2_dw1] = tape.gradient(y2, [w1])

print(dy2_dw1[0][0])

# print(dy2_dw1[0][0] == dy2_dy1[0][0] * dy1_dw1[0][0]) 

tf.Tensor(
[[17.540981 17.540981 17.540981]
 [18.754086 18.754086 18.754086]
 [ 8.839819  8.839819  8.839819]
 [19.85136  19.85136  19.85136 ]], shape=(4, 3), dtype=float32)
tf.Tensor(
[[16.694921  8.69648  17.586002]
 [16.694921  8.69648  17.586002]
 [16.694921  8.69648  17.586002]], shape=(3, 3), dtype=float32)
tf.Tensor(292.8453, shape=(), dtype=float32)
tf.Tensor(292.84534, shape=(), dtype=float32)


In [118]:
x1 = torch.rand(3,4)
w1 = torch.rand(4,3, requires_grad=True)
b1 = torch.zeros(3, requires_grad=True)

w2 = torch.rand(3,2, requires_grad=True)
b2 = torch.zeros(2, requires_grad=True)

y1 = F.relu(x1@w1 + b1)
y2 = F.softmax(y1@w2 + b2, dim = 1)

(dy1_dw1,) = torch.autograd.grad(y1[0][0], w1, retain_graph=True)

(dy2_dy1,) = torch.autograd.grad(y2[0][0], y1, retain_graph=True)

print(dy2_dy1)
print(dy1_dw1)

print(dy2_dy1[0][0] * dy1_dw1[0][0])

(dy2_dw1,)= torch.autograd.grad(y2[0][0], w1)
print(dy2_dw1[0][0])

print(dy2_dy1[0][0] * dy1_dw1[0][0] == dy2_dw1[0][0])

tensor([[-0.1262,  0.0498,  0.0815],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000]])
tensor([[0.3605, 0.0000, 0.0000],
        [0.4910, 0.0000, 0.0000],
        [0.8196, 0.0000, 0.0000],
        [0.7047, 0.0000, 0.0000]])
tensor(-0.0455)
tensor(-0.0455)
tensor(True)


### Opimization

**Himmelblau function**

\begin{equation}
f(x, y)=\left(x^{2}+y-11\right)^{2}+\left(x+y^{2}-7\right)^{2}
\end{equation}

zero points: [3, 2],  [-2.805118, 3.131312],  [-3.779310, -3.283186],  [3.583328, -1.848126]

In [34]:
import  numpy as np
from    mpl_toolkits.mplot3d import Axes3D
from    matplotlib import pyplot as plt



def himmelblau(x):
    return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2


x = np.arange(-6, 6, 0.1)
y = np.arange(-6, 6, 0.1)
print('x,y range:', x.shape, y.shape)
X, Y = np.meshgrid(x, y)
print('X,Y maps:', X.shape, Y.shape)
Z = himmelblau([X, Y])

fig = plt.figure('himmelblau')
ax = fig.gca(projection='3d')
ax.plot_surface(X, Y, Z)
ax.view_init(60, -30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

x,y range: (120,) (120,)
X,Y maps: (120, 120) (120, 120)


<Figure size 640x480 with 1 Axes>

#### Without optimizer

In [45]:
x = tf.Variable([0.,0.])

lr = 0.001

for step in range(30000):
    
    with tf.GradientTape() as tape:
        pred = himmelblau(x)
    
    grads = tape.gradient(pred, [x])
    x.assign_sub(lr * grads[0])

    if(step % 2000 == 0):
        print('step {}: x = {}, pred = {}'
             .format(step, x.numpy(), pred.numpy()))


step 0: x = [0.014 0.022], pred = 170.0
step 2000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 4000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 6000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 8000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 10000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 12000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 14000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 16000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 18000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 20000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 22000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 24000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 26000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10
step 28000: x = [2.9999971 2.000005 ], pred = 4.483808879740536e-10


In [50]:
x = torch.tensor([0.,0.], requires_grad=True)

lr = 0.001

for step in range(30000):
    
    pred = himmelblau(x)
    
    grads = torch.autograd.grad(pred, [x])
    x.data.sub_(lr * grads[0])
    
    if(step % 2000 == 0):
        print('step {}: x = {}, pred = {}'
             .format(step, x.tolist(), pred.item()))


step 0: x = [0.014000000432133675, 0.02200000174343586], pred = 170.0
step 2000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 4000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 6000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 8000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 10000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 12000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 14000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 16000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 18000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 20000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10
step 22000: x = [2.999997138977051, 2.000005006790161], pred = 4.483808879740536e-10

#### With Optimizer

In [None]:
x = tf.Variable([0.,0.])

lr = 0.001
optimizer = tf.optimizers.SGD(lr)

for step in range(30000):
    
    with tf.GradientTape() as tape:
        pred = himmelblau(x)
    
    grads = tape.gradient(pred, [x])
    
    optimizer.apply_gradients(grads_and_vars = zip(grads, [x]))

    if(step % 2000 == 0):
        print('step {}: x = {}, pred = {}'
             .format(step, x.numpy(), pred.numpy()))


In [None]:
x = torch.tensor([0.,0.], requires_grad=True)

lr = 0.001
optimizer = torch.optim.SGD([x],lr=lr)

for step in range(30000):
    
    pred = himmelblau(x)
    
    optimizer.zero_grad()
    pred.backward()
    optimizer.step()
    
    if(step % 2000 == 0):
        print('step {}: x = {}, pred = {}'
             .format(step, x.tolist(), pred.item()))
