### Torch Tensor Playground 2
9.18.2020

Beginner Tutorial
https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

Tensors Reference Sheet
https://pytorch.org/docs/stable/tensors.html



**Old:**

Basic "w3 school" tutorial
https://deeplizard.com/learn/video/fCVuiW9AFzY

Reference Doc
https://pytorch.org/docs/stable/torch.html
    


In [1]:
import torch
from torch import tensor

In [2]:
tensor([1,2])

tensor([1, 2])

In [3]:
# 3x4 float32 tensor in which every element of row i holds the value i + 1.
t = tensor([[float(v)] * 4 for v in (1, 2, 3)], dtype=torch.float32)

In [5]:
t

tensor([[1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.]])

In [4]:
t.size()

torch.Size([3, 4])

In [6]:
t.shape

torch.Size([3, 4])

In [7]:
# len() of a tensor is the size of its first dimension (3 rows here),
# not the total number of elements -- compare with numel() below.
len(t)

3

In [8]:
t.numel()

12

In [9]:
# Same 12 elements laid out as a single row; values are read row by row.
t.reshape([1,12])

tensor([[1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.]])

In [10]:
# Reshape to 2x6: elements keep their row-by-row order, so the original
# middle row (the 2s) ends up split across both new rows.
t.reshape([2,6])

tensor([[1., 1., 1., 1., 2., 2.],
        [2., 2., 3., 3., 3., 3.]])

In [11]:
# Two integer tensors of identical shape (3, 2), used for element-wise
# arithmetic in the following cells.
a = torch.tensor([
    [1, 1],
    [2, 1],
    [3, 2],
])
b = torch.tensor([
    [-1, 1],
    [2, 2],
    [1, 4],
])

In [12]:
a

tensor([[1, 1],
        [2, 1],
        [3, 2]])

In [13]:
b

tensor([[-1,  1],
        [ 2,  2],
        [ 1,  4]])

In [14]:
a - b

tensor([[ 2,  0],
        [ 0, -1],
        [ 2, -2]])

In [15]:
a.shape, b.shape

(torch.Size([3, 2]), torch.Size([3, 2]))

### Interchange with `numpy`
https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#numpy-bridge

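Some conversions share memory with the source instead of copying it: `Tensor.numpy()` and `torch.from_numpy()` return objects backed by the same data, so a write through one is visible in the other. `torch.tensor()` makes an independent copy.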

In [16]:
import numpy as np

In [18]:
# Independent starting points for the conversion experiments:
# one NumPy array and one PyTorch tensor.
start_np, start_tc = np.array([1, 2, 3]), torch.tensor([4, 5, 6])

In [19]:
start_np

array([1, 2, 3])

In [20]:
start_tc

tensor([4, 5, 6])

In [21]:
# Tensor.numpy() hands back an ndarray that shares memory with the tensor
# rather than a copy (demonstrated by the mutation in the next cell).
cvt_np = start_tc.numpy()
cvt_np

array([4, 5, 6])

In [22]:
# Writing through the NumPy array also changes start_tc:
# both objects are backed by the same memory.
cvt_np[-1] = 99
start_tc

tensor([ 4,  5, 99])

In [23]:
# torch.from_numpy() wraps the existing ndarray's memory; no copy is made
# (demonstrated by the mutation in the next cell).
cvt_tc = torch.from_numpy(start_np)

In [25]:
# Writing through the tensor is visible in the original ndarray.
cvt_tc[-1] = 555
start_np

array([  1,   2, 555])

In [26]:
# torch.tensor() copies the data, so create_tc is independent of start_np
# (unlike torch.from_numpy above).
create_tc = torch.tensor(start_np)

In [27]:
create_tc

tensor([  1,   2, 555])

In [28]:
# Mutating the copy leaves start_np untouched: it still ends in 555, not 1717.
create_tc[-1] = 1717
start_np

array([  1,   2, 555])

In [29]:
a = torch.tensor([1,2,3])
a

tensor([1, 2, 3])

In [30]:
# add() is out-of-place: it returns a new tensor and leaves `a` unchanged.
a.add(99)

tensor([100, 101, 102])

In [31]:
a

tensor([1, 2, 3])

In [32]:
# add_() (trailing underscore) operates in place: `a` itself is modified.
a.add_(-1)

tensor([0, 1, 2])

In [33]:
a

tensor([0, 1, 2])

In [40]:
# Turn the 1-D tensor into a (1, 3) row, transpose it, and end with a
# (3, 1) column.
# NOTE(review): the trailing reshape((3, 1)) looks redundant -- the transpose
# of a (1, 3) tensor should already be (3, 1). Confirm before removing.
a.reshape((1,3)).t().reshape((3,1))

tensor([[0],
        [1],
        [2]])

In [60]:
# Setup for a two-layer network trained on random data.
# (torch is already imported in the first cell; the import is repeated so
# this cell can also be run on its own.)
import torch

# Seed the global RNG so the random data and weights created below are the
# same after every kernel restart, making the printed results reproducible.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False): no gradients are needed
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# Step size shared by the optimizer-based and the manual training loops.
learning_rate = 1e-6

In [47]:
x.shape

torch.Size([64, 1000])

In [48]:
w1.shape

torch.Size([1000, 100])

In [49]:
# First matrix product (no activation applied here):
# (64, 1000) @ (1000, 100) -> (64, 100).
l2 = x.mm(w1)
l2.shape

torch.Size([64, 100])

In [50]:
w2.shape

torch.Size([100, 10])

In [51]:
# Second matrix product: (64, 100) @ (100, 10) -> (64, 10), the same shape as y.
l3 = l2.mm(w2)

In [52]:
l3.shape

torch.Size([64, 10])

In [53]:
y.shape

torch.Size([64, 10])

In [62]:
# Sum of squared differences between the prediction l3 and the target y,
# reduced to a single scalar tensor.
loss = (l3 - y).pow(2).sum()


In [63]:
loss

tensor(14316466., grad_fn=<SumBackward0>)

In [64]:
loss.item()

14316466.0

In [65]:
loss.tolist()

14316466.0

In [66]:
loss

tensor(14316466., grad_fn=<SumBackward0>)

##### Experiment: what does `loss.backward()` change, the weights or only their gradients?

In [124]:
# Forward pass written with the functional torch.* API:
# matrix product -> clamp negatives to 0 -> matrix product.
y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

# Sum of squared errors between prediction and target.
loss = torch.sum(torch.pow(y_pred - y, 2))

# Show the loss as a plain Python float.
loss.item()

5.953886102361139e-06

In [125]:
# Snapshot of the current weights, detached from the autograd graph, so it
# can be compared with w1 after backward() has run.
old_w1 = w1.detach().clone()

# True when the snapshot and the live weights match element for element.
torch.equal(old_w1, w1)

True

In [126]:
# Open question: how to copy a tensor together with its gradient?
# Attempts so far:
#   old_w1 = w1.detach().clone().requires_grad_(True)
#   old_w1 = torch.tensor(w1, requires_grad=True)
#   old_w1.grad
# NOTE(review): these appear to copy only the values, leaving .grad unset on
# the copy -- confirm. The gradient is therefore snapshotted separately below.

# w1.grad is None until a backward() call has populated it (e.g. on a fresh
# kernel), and calling .clone() on None raises AttributeError. Treat
# "no gradient yet" as an all-zero gradient so this cell works regardless of
# whether backward() has already been run.
current_w1_grad = w1.grad if w1.grad is not None else torch.zeros_like(w1)
old_w1_grad = current_w1_grad.clone()

# Sanity check: the snapshot matches the gradient it was taken from.
(old_w1_grad == current_w1_grad).all()

tensor(True)

In [127]:
loss.backward()

In [128]:
(old_w1 == w1).all().item()

True

In [130]:
(old_w1_grad == w1.grad).all().item()

False

##### Training loops: `torch.optim.SGD` vs. manual update

---

In [162]:
# Reset: draw fresh random data and weights (same layer sizes as before) so
# the training loops below start from an untrained state.
N, D_in, H, D_out = 64, 1000, 100, 10

# Input batch and targets; no gradients are needed with respect to these.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

# Weights; requires_grad=True makes autograd compute gradients with respect
# to these tensors during the backward pass.
w1 = torch.randn((D_in, H), dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn((H, D_out), dtype=dtype, device=device, requires_grad=True)

In [163]:
# Plain SGD over both weight matrices, using the shared learning rate.
sgd = torch.optim.SGD([w1, w2], lr=learning_rate)

In [164]:
# Training loop driven by torch.optim.SGD.
# The loop counter is named `step` (not `t`) so it does not overwrite the
# tensor `t` created near the top of the notebook.
for step in range(500):
    # Forward pass: compute predicted y using operations on Tensors. Autograd
    # records the graph, so no intermediate values have to be kept by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss (sum of squared errors).
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()
    if step % 100 == 0:
        print(step, loss.item())

    # Use autograd to compute the backward pass. After this call w1.grad and
    # w2.grad hold the gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Let the optimizer apply the gradient-descent update to w1 and w2, then
    # clear the gradients so the next backward() starts from zero. This
    # replaces the manual torch.no_grad() update and grad.zero_() calls used
    # in the hand-written version of this loop.
    sgd.step()
    sgd.zero_grad()

0 36007488.0
100 411.0628356933594
200 1.6606816053390503
300 0.01072639785706997
400 0.00023579603293910623


In [152]:
# Training loop with a hand-written gradient-descent update.
# The loop counter is named `step` (not `t`) so it does not overwrite the
# tensor `t` created near the top of the notebook.
for step in range(500):
    # Forward pass: compute predicted y using operations on Tensors. Autograd
    # records the graph, so no intermediate values have to be kept by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss (sum of squared errors).
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()
    if step % 100 == 0:
        print(step, loss.item())

    # Use autograd to compute the backward pass. After this call w1.grad and
    # w2.grad hold the gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but the update itself must not
    # be tracked by autograd.
    # torch.optim.SGD can be used to achieve the same thing.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights so they do not
        # carry over into the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

0 31284054.0
100 297.78515625
200 0.7103790044784546
300 0.002740600612014532
400 9.063919424079359e-05


In [45]:
x.shape

torch.Size([64, 1000])

In [136]:
# Small float tensor that tracks gradients, used to poke at the optimizer API.
w = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float, requires_grad=True)
w


tensor([1., 2., 3.], requires_grad=True)

In [137]:
# Throwaway SGD optimizer over a single three-element parameter, lr = 1e-3.
# NOTE(review): the parameter is a fresh anonymous tensor, not the `w`
# created in the previous cell -- confirm that is intended.
sgd = torch.optim.SGD(
    [torch.tensor([1.0, 2.0, 3.0], dtype=torch.float, requires_grad=True)],
    lr=1e-3,
)
     

In [138]:
# Without parentheses this only displays the bound method, not the state;
# call sgd.state_dict() to get the actual dictionary.
sgd.state_dict

<bound method Optimizer.state_dict of SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)>

In [144]:
sgd.step()

In [141]:
sgd.defaults

{'lr': 0.001,
 'momentum': 0,
 'dampening': 0,
 'weight_decay': 0,
 'nesterov': False}

In [143]:
sgd.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [140508626779424]}]}

In [146]:
sgd.zero_grad()

In [147]:
sgd.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [140508626779424]}]}

In [148]:
sgd

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [149]:
sgd.param_groups

[{'params': [tensor([1., 2., 3.], requires_grad=True)],
  'lr': 0.001,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False}]