<a href="https://colab.research.google.com/gist/ejmejm/1baeddbbe48f58dbced9c019c25ebf71/pytorch_tips_yt_follow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 7 PyTorch Tips You Should Know

In [1]:
import time

import torch
from torch import nn

# 1. Create Tensors Directly on the Target Device

In [3]:
start_time = time.time()

for _ in range(100):
  # Creating on the CPU, then transfering to the GPU
  cpu_tensor = torch.ones((1000, 64, 64))
  gpu_tensor = cpu_tensor.cuda()

print('Total time: {:.3f}s'.format(time.time() - start_time))

Total time: 0.584s


In [4]:
start_time = time.time()

for _ in range(100):
  # Creating on GPU directly
  cpu_tensor = torch.ones((1000, 64, 64), device='cuda')

print('Total time: {:.3f}s'.format(time.time() - start_time))

Total time: 0.009s


# 2. Use `Sequential` Layers When Possible

In [6]:
class ExampleModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    self.mid_layer = nn.Linear(hidden_size, hidden_size)
    self.mid_activation = nn.ReLU()

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    
    z = self.mid_layer(z)
    z = self.mid_activation(z)
    
    out = self.output_layer(z)

    return out

In [7]:
example_model = ExampleModel()
print(example_model)
print('Output shape:', example_model(torch.ones([100, 2])).shape)

ExampleModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layer): Linear(in_features=16, out_features=16, bias=True)
  (mid_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)
Output shape: torch.Size([100, 3])


In [9]:
class ExampleSequentialModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.layers = nn.Sequential(
      nn.Linear(input_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, output_size))

  def forward(self, x):
    out = self.layers(x)
    return out

In [10]:
example_seq_model = ExampleSequentialModel()
print(example_seq_model)
print('Output shape:', example_seq_model(torch.ones([100, 2])).shape)

ExampleSequentialModel(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=3, bias=True)
  )
)
Output shape: torch.Size([100, 3])


# 3. Don't Make Lists of Layers

In [11]:
class BadListModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    # Fairly common when using residual layers
    self.mid_layers = []
    for _ in range(5):
      self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
      self.mid_layers.append(nn.ReLU())

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    
    for layer in self.mid_layers:
      z = layer(z)
    
    out = self.output_layer(z)

    return out

In [12]:
bad_list_model = BadListModel()
print('Output shape:', bad_list_model(torch.ones([100, 2])).shape)

Output shape: torch.Size([100, 3])


In [13]:
gpu_input = torch.ones([100, 2], device='cuda')
gpu_bad_list_model = bad_list_model.cuda()
print('Output shape:', bad_list_model(gpu_input).shape)

RuntimeError: ignored

## Better Way to Do This

In [14]:
class CorrectListModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    # Fairly common when using residual layers
    self.mid_layers = []
    for _ in range(5):
      self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
      self.mid_layers.append(nn.ReLU())
    self.mid_layers = nn.Sequential(*self.mid_layers)

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    z = self.mid_layers(z)
    out = self.output_layer(z)

    return out

In [15]:
correct_list_model = CorrectListModel()
gpu_input = torch.ones([100, 2], device='cuda')
gpu_correct_list_model = correct_list_model.cuda()
print('Output shape:', correct_list_model(gpu_input).shape)

Output shape: torch.Size([100, 3])


# 4. Make Use of Distributions

In [16]:
# Setup
example_model = ExampleModel()
input_tensor = torch.rand(5, 2)
output = example_model(input_tensor)
print(output)

tensor([[ 0.1965,  0.0558, -0.2112],
        [ 0.2035,  0.0650, -0.2077],
        [ 0.2150,  0.0577, -0.2096],
        [ 0.1957,  0.0540, -0.2117],
        [ 0.2045,  0.0566, -0.2085]], grad_fn=<AddmmBackward>)


In [17]:
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

In [18]:
dist = Categorical(logits=output)
dist

Categorical(logits: torch.Size([5, 3]))

In [19]:
# Get probabilities
dist.probs

tensor([[0.3946, 0.3428, 0.2625],
        [0.3947, 0.3437, 0.2616],
        [0.3986, 0.3406, 0.2607],
        [0.3947, 0.3426, 0.2627],
        [0.3962, 0.3417, 0.2621]], grad_fn=<SoftmaxBackward>)

In [24]:
# Take samples
dist.sample()

tensor([0, 1, 0, 0, 2])

In [25]:
# Calculate the KL-Divergence
dist_1 = Categorical(logits=output[0])
dist_2 = Categorical(logits=output[1])
kl_divergence(dist_1, dist_2)

tensor(2.5076e-06, grad_fn=<SumBackward1>)

# 5. Use `detach()` On Long-Term Metrics

In [27]:
# Setup
example_model = ExampleModel()
data_batches = [torch.rand((10, 2)) for _ in range(5)]
criterion = nn.MSELoss(reduce='mean')



## Bad Example

In [28]:
losses = []

# Training loop
for batch in data_batches:
  output = example_model(batch)

  target = torch.rand((10, 3))
  loss = criterion(output, target)
  losses.append(loss)

  # Optimization happens here

print(losses)

[tensor(0.4718, grad_fn=<MseLossBackward>), tensor(0.5156, grad_fn=<MseLossBackward>), tensor(0.6583, grad_fn=<MseLossBackward>), tensor(0.4429, grad_fn=<MseLossBackward>), tensor(0.4133, grad_fn=<MseLossBackward>)]


## Better Example

In [31]:
losses = []

# Training loop
for batch in data_batches:
  output = example_model(batch)

  target = torch.rand((10, 3))
  loss = criterion(output, target)
  losses.append(loss.item()) # Or `loss.item()`

  # Optimization happens here

print(losses)

[0.5439911484718323, 0.5461570620536804, 0.6738904118537903, 0.5780249834060669, 0.5130327939987183]


# 6. Trick to Delete a Model from GPU

In [32]:
import gc

In [33]:
example_model = ExampleModel().cuda()

del example_model

gc.collect()
# The model will normally stay on the cache until something takes it's place
torch.cuda.empty_cache()

# 7. Call `eval()` Before Testing

In [34]:
example_model = ExampleModel()

# Do training

example_model.eval()

# Do testing

example_model.train()

# Do training again

ExampleModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layer): Linear(in_features=16, out_features=16, bias=True)
  (mid_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)

### Affects
  - Dropout
  - Batch Normalization
  - RNNs
  - Lazy Variants

source: https://stackoverflow.com/questions/66534762/which-pytorch-modules-are-affected-by-model-eval-and-model-train