# Chapter 13. Computational Performance

* This chapter will focus on the major factors that affect computational performance: imperative programming, symbolic programming, asynchronous computing, automatic parallelism, and multi-GPU computation.

## Chapter 13.1 Compilers and Interpreters

In [6]:
def add(a, b):
    """Return ``a + b``, using whatever ``+`` means for the operand types."""
    total = a + b
    return total

def fancy_func(a, b, c, d):
    """Combine four values pairwise through ``add``: ``(a + b) + (c + d)``."""
    left = add(a, b)
    right = add(c, d)
    # The two partial sums are themselves combined with the same helper.
    return add(left, right)

# Expected result: (1 + 2) + (3 + 4) = 10.
print(fancy_func(1, 2, 3, 4))

10


In [7]:
import torch
from torch import nn
from d2l import torch as d2l

# Factory for networks
def get_net():
    """Build a freshly initialized MLP mapping 512 input features to 2 outputs.

    Returns:
        nn.Sequential: Linear(512, 256) -> ReLU -> Linear(256, 128) -> ReLU
        -> Linear(128, 2). A new module is created on every call.
    """
    layers = [
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 2),
    ]
    return nn.Sequential(*layers)

# One random sample with 512 features, matching the input width of the first Linear layer.
x = torch.randn(size=(1, 512))
net = get_net()
# Forward pass in eager mode; as the cell's last expression, the result is displayed.
net(x)

tensor([[0.0410, 0.2545]], grad_fn=<AddmmBackward0>)

In [8]:
# By converting the model with the torch.jit.script function,
# we are able to compile and optimize the computation in the MLP.

# Rebind `net` to the scripted module; the eager module is no longer reachable under this name.
net = torch.jit.script(net)
# Same input `x` as in the eager call, so the two displayed outputs can be compared directly.
net(x)

tensor([[0.0410, 0.2545]], grad_fn=<AddmmBackward0>)

In [9]:
# Benchmark helper for comparing run time with and without TorchScript

class Benchmark:
    """Context manager that prints how long its ``with`` body took.

    Usage:
        with Benchmark('label'):
            ...  # code to time
    """

    def __init__(self, description='Done'):
        # Label printed in front of the elapsed time.
        self.description = description

    def __enter__(self):
        # A new timer is created on every entry.
        # NOTE(review): presumably d2l.Timer starts counting on construction — confirm.
        self.timer = d2l.Timer()
        return self

    def __exit__(self, *exc_info):
        # Runs whether or not the body raised; returning None lets exceptions propagate.
        elapsed = self.timer.stop()
        print(f'{self.description}: {elapsed:.4f} sec')


In [10]:
# Start from a fresh eager-mode network so the first measurement does not use a scripted module.
net = get_net()

# Time 3000 forward passes of the eager module on the single-sample input `x`.
with Benchmark('Without torchscript'):
    for i in range(3000):
        net(x)

# Compile that same network instance with TorchScript and rebind `net` to the result.
net = torch.jit.script(net)

# NOTE(review): no warm-up call precedes this loop, so any one-time cost of the
# scripted module's first invocations is included in the measurement — confirm
# whether that is intended.
with Benchmark('With torchscript'):
    for i in range(3000):
        net(x)


Without torchscript: 0.2719 sec
With torchscript: 0.2697 sec


In [None]:
# `net` is expected to be the module returned by torch.jit.script in the benchmark cell;
# save() writes it to the relative path 'my_mlp'.
net.save('my_mlp')
# Shell command: list the saved file(s) with human-readable sizes.
!ls -lh my_mlp*

-rw-r--r--  1 yfzhu  staff   651K Jan 29 11:47 my_mlp


## Chapter 13.2 Asynchronous Computation

In [None]:
# Installation of d2l packages

!pip install torch==2.0.0 torchvision==0.15.1
!pip install d2l==1.0.3

In [11]:
import os
import subprocess
import numpy
import torch
from torch import nn
from d2l import torch as d2l


In [14]:
# Warm-up: run one matrix product on the selected device before anything is timed,
# so one-time initialization cost is not attributed to the 'torch' measurement.
device = d2l.try_gpu()
a = torch.randn(size=(1000, 1000), device=device)
b = torch.mm(a, a)

# Baseline: ten 1000x1000 matrix products with NumPy on the CPU.
with d2l.Benchmark('numpy'):
    for _ in range(10):
        a = numpy.random.normal(size=(1000, 1000))
        b = numpy.dot(a, a)

# Same workload with PyTorch on `device`.
with d2l.Benchmark('torch'):
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)
    # CUDA work is queued asynchronously, so wait for it before the timer stops.
    # torch.cuda.synchronize() rejects non-CUDA devices, so skip it when
    # d2l.try_gpu() has fallen back to the CPU (CPU ops are already synchronous).
    if device.type == 'cuda':
        torch.cuda.synchronize(device)

numpy: 1.4007 sec
torch: 0.0085 sec


In [16]:
# Another example to understand the dependency graph.

# Two 1x2 tensors of ones on the selected device.
# NOTE: this rebinds `x`, which earlier held the (1, 512) network input.
x = torch.ones((1, 2), device=device)
y = torch.ones((1, 2), device=device)
# z depends on both x and y: element-wise product, then add 2.
z = x * y + 2
# Last expression of the cell, so the value of z is displayed.
z

tensor([[3., 3.]], device='cuda:0')

## Chapter 13.3 Automatic Parallelism


In [19]:
# Devices reported by d2l.try_all_gpus(); the cells below index entries 0 and 1 of this list.
devices = d2l.try_all_gpus()

def run(x, num_runs=50):
  """Compute the matrix product ``x.mm(x)`` repeatedly as a benchmark workload.

  Args:
    x: 2-D square tensor (square so that ``x.mm(x)`` is defined). The products
      are computed on whatever device ``x`` lives on.
    num_runs: Number of independent products to compute. Defaults to 50,
      the workload size this function was originally hard-coded to.

  Returns:
    A list of ``num_runs`` tensors, each the result of ``x.mm(x)``.
  """
  return [x.mm(x) for _ in range(num_runs)]

# One 4000x4000 uniform-random matrix per device.
# NOTE(review): devices[1] assumes at least two devices were returned — confirm on the target machine.
x_gpu1 = torch.rand(size=(4000, 4000), device=devices[0])
x_gpu2 = torch.rand(size=(4000, 4000), device=devices[1])

In [21]:
# Warm-up: run the workload once on each device, then wait for both devices
# so that pending warm-up work is not counted in the timings below.
run(x_gpu1)
run(x_gpu2)

torch.cuda.synchronize(devices[0])
torch.cuda.synchronize(devices[1])

# Time each device on its own. The synchronize call sits inside the `with` body
# so the timer stops only after the work queued on that device has completed.
with d2l.Benchmark('GPU 1 time'):
  run(x_gpu1)
  torch.cuda.synchronize(devices[0])

with d2l.Benchmark('GPU 2 time'):
  run(x_gpu2)
  torch.cuda.synchronize(devices[1])



GPU 1 time: 1.9003 sec
GPU 2 time: 1.9108 sec


In [22]:
# Issue both workloads back to back, then wait for *both* devices before the timer stops.
# torch.cuda.synchronize() without an argument only waits for the current device,
# so each tensor's own device is synchronized explicitly.
with d2l.Benchmark('GPU1 & GPU2'):
  run(x_gpu1)
  run(x_gpu2)
  torch.cuda.synchronize(x_gpu1.device)
  torch.cuda.synchronize(x_gpu2.device)



GPU1 & GPU2: 3.6658 sec


In [25]:
def copy_to_cpu(x, non_blocking=False):
  """Move every tensor in the iterable ``x`` to the CPU.

  Args:
    x: Iterable of tensors.
    non_blocking: Forwarded unchanged to ``Tensor.to`` for each element.

  Returns:
    A list with one CPU tensor per element of ``x``, in the same order.
  """
  cpu_tensors = []
  for y in x:
    cpu_tensors.append(y.to('cpu', non_blocking=non_blocking))
  return cpu_tensors

# Step 1: time the computation alone; synchronize so all queued products have finished.
with d2l.Benchmark('Run on GPU1'):
  y = run(x_gpu1)
  torch.cuda.synchronize()

# Step 2: time the copy of every result to the CPU (default non_blocking=False).
# Despite the 'Run on CPU' label, this block measures only the copy, not any computation.
with d2l.Benchmark('Run on CPU'):
  y_cpu = copy_to_cpu(y)
  torch.cuda.synchronize()

Run on GPU1: 1.9252 sec
Run on CPU: 3.2244 sec


In [26]:
# Compute and copy inside a single timed region, requesting non-blocking copies
# (second argument True is copy_to_cpu's non_blocking flag).
# NOTE(review): whether the copies actually overlap with the remaining computation
# depends on the backend/destination memory — confirm before relying on the speed-up.
with d2l.Benchmark('Run on GPU1 and copy to CPU'):
  y = run(x_gpu1)
  y_cpu = copy_to_cpu(y, True)
  # Wait for outstanding work so the timer covers both the compute and the copies.
  torch.cuda.synchronize()

Run on GPU1 and copy to CPU: 2.6150 sec


## Chapter 13.4 Hardware

In [28]:
# Shell command: report the NVIDIA driver/CUDA versions and the current state of each visible GPU.
!nvidia-smi

Mon Jan 29 04:48:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              35W /  70W |   6503MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    