In [1]:
import torch
import torch_xla
import torch_xla.runtime




In [2]:
import torch_xla.debug.metrics as met

# Snapshot of the XLA metrics taken before any tensor work has been issued.
baseline_report = met.metrics_report()
print(baseline_report)


Counter: RegisterXLAFunctions
  Value: 1



In [3]:
with torch_xla.runtime.xla_device():
  # Baseline run: there is no save_on_cpu context in this cell, so autograd
  # keeps every tensor it saves for backward as an XLA tensor (no host offload).
  a = torch.randn(5, requires_grad=True)
  b = torch.randn(5, requires_grad=True)
  c = torch.randn(5, requires_grad=True)
  def f(a, b, c):
      """Return ((a * b) * c) * a using autograd's default saved-tensor handling."""
      prod_1 = a * b           # a and b are saved for backward on the XLA device
      prod_2 = prod_1 * c  # prod_1 and c are saved on the XLA device as well (nothing goes to CPU here)
      y = prod_2 * a           # prod_2 and a are saved on the XLA device
      return y
  y = f(a, b, c)
  del a, b, c  # for illustration only
  # the contents of the deleted tensors are still alive: the autograd graph
  # reachable from y holds the tensors that were saved for backward
  y.sum().backward()  # backward uses the saved tensors as-is; no CPU round trip in this baseline


In [4]:
import torch_xla.debug.metrics as met

# Metrics accumulated so far, i.e. after the run without save_on_cpu.
no_offload_report = met.metrics_report()
print(no_offload_report)



Metric: LazyTracing
  TotalSamples: 30
  Accumulator: 002ms719.177us
  ValueRate: 439ms935.185us / second
  Rate: 7659.51 / second
  Percentiles: 1%=000.520us; 5%=000.650us; 10%=001.000us; 20%=005.830us; 50%=013.940us; 80%=046.170us; 90%=125.910us; 95%=278.209us; 99%=779.780us
Metric: TensorToData
  TotalSamples: 1
  Accumulator: 131.260us
  Percentiles: 1%=131.260us; 5%=131.260us; 10%=131.260us; 20%=131.260us; 50%=131.260us; 80%=131.260us; 90%=131.260us; 95%=131.260us; 99%=131.260us
Counter: CreateXlaTensor
  Value: 19
Counter: DestroyLtcTensor
  Value: 12
Counter: DestroyXlaTensor
  Value: 12
Counter: RegisterXLAFunctions
  Value: 1
Counter: xla::_propagate_xla_data
  Value: 3
Counter: xla::add
  Value: 1
Counter: xla::clone
  Value: 3
Counter: xla::detach_copy
  Value: 3
Counter: xla::empty_strided_symint
  Value: 1
Counter: xla::empty_symint
  Value: 4
Counter: xla::expand_copy_symint
  Value: 1
Counter: xla::fill_
  Value: 1
Counter: xla::mul
  Value: 9
Counter: xla::normal_
  Val

In [5]:
# Import before first use so this cell does not depend on `met` having been
# bound by an earlier cell.
import torch_xla.debug.metrics as met

# Reset the recorded metrics and counters so that the next report only
# reflects work issued after this point, then print the (now cleared) report.
met.clear_all()
print(met.metrics_report())






In [6]:
with torch_xla.runtime.xla_device():
  def f(a, b, c):
      """Return ((a * b) * c) * a, offloading the middle product's saved tensors to CPU."""
      # Outside the hook: a and b are saved for backward on the device.
      ab = a * b
      with torch.autograd.graph.save_on_cpu():
          # Inside the hook: ab and c are saved for backward on the CPU.
          abc = ab * c
      # Outside the hook again: abc and a are saved on the device.
      return abc * a

  a = torch.randn(5, requires_grad=True)
  b = torch.randn(5, requires_grad=True)
  c = torch.randn(5, requires_grad=True)

  y = f(a, b, c)
  # Dropping the names is for illustration only: the tensors saved on the
  # device stay alive there, while the ones saved inside the save_on_cpu
  # context are kept on the CPU only.
  del a, b, c
  # Backward copies the CPU-saved tensors back to the device when it needs
  # them; intermediate tensors are released once backward has finished.
  y.sum().backward()

In [7]:
import torch_xla.debug.metrics as met

# Metrics recorded since the reset, i.e. for the save_on_cpu run only.
save_on_cpu_report = met.metrics_report()
print(save_on_cpu_report)


Metric: DeviceLockWait
  TotalSamples: 4
  Accumulator: 033.620us
  ValueRate: 236.534us / second
  Rate: 28.1421 / second
  Percentiles: 1%=003.570us; 5%=003.570us; 10%=003.570us; 20%=003.570us; 50%=012.700us; 80%=013.510us; 90%=013.510us; 95%=013.510us; 99%=013.510us
Metric: IrValueTensorToXlaData
  TotalSamples: 2
  Accumulator: 162.920us
  ValueRate: 01s454ms123.527us / second
  Rate: 17850.8 / second
  Percentiles: 1%=043.180us; 5%=043.180us; 10%=043.180us; 20%=043.180us; 50%=119.740us; 80%=119.740us; 90%=119.740us; 95%=119.740us; 99%=119.740us
Metric: LazyTracing
  TotalSamples: 38
  Accumulator: 257ms024.902us
  ValueRate: 990ms222.824us / second
  Rate: 146.4 / second
  Percentiles: 1%=000.580us; 5%=000.600us; 10%=001.331us; 20%=007.250us; 50%=019.350us; 80%=066.069us; 90%=147.711us; 95%=121ms754.242us; 99%=135ms083.320us
Metric: TensorToData
  TotalSamples: 2
  Accumulator: 155.871us
  ValueRate: 01s381ms210.623us / second
  Rate: 17722.5 / second
  Percentiles: 1%=040.851us; 