# Chapter 13. Computational Performance

* This chapter will focus on the major factors that affect computational performance: imperative programming, symbolic programming, asynchronous computing, automatic parallelism, and multi-GPU computation. 

## Chapter 13.1 Compilers and Interpreters

In [1]:
def add(a, b):
    return a + b

def fancy_func(a, b, c, d):
    e = add(a, b)
    f = add(c, d)
    g = add(e, f)
    
    return g

print(fancy_func(1, 2, 3, 4))

10


In [2]:
import torch
from torch import nn
from d2l import torch as d2l

# Factory for networks
def get_net():
    net = nn.Sequential(nn.Linear(512, 256), 
                       nn.ReLU(), 
                       nn.Linear(256, 128), 
                       nn.ReLU(),
                       nn.Linear(128, 2))
    return net

x = torch.randn(size=(1, 512))
net = get_net()
net(x)

tensor([[ 0.0162, -0.0750]], grad_fn=<AddmmBackward0>)

In [3]:
# By converting the model using torch.jit.script function, 
# we are able to compile and optimize the computation in the MLP. 

net = torch.jit.script(net)
net(x)

tensor([[ 0.0162, -0.0750]], grad_fn=<AddmmBackward0>)

In [7]:
# Benchmark to compare with or without jitscript

class Benchmark:
    """For measuring running time."""
    
    def __init__(self, description='Done'):
        self.description = description
        
    def __enter__(self):
        self.timer = d2l.Timer()
        return self
    
    def __exit__(self, *args):
        print(f'{self.description}: {self.timer.stop():.4f} sec')
        

In [10]:
net = get_net()

with Benchmark('Without torchscript'):
    for i in range(3000):
        net(x)
        
net = torch.jit.script(net)

with Benchmark('With torchscript'):
    for i in range(3000):
        net(x)
        

Without torchscript: 0.4448 sec
With torchscript: 0.3914 sec


In [11]:
net.save('my_mlp')
!ls -lh my_mlp*

-rw-r--r--  1 yfzhu  staff   651K Jan 29 11:47 my_mlp


## Chapter 13.2 Asynchronous Computation

In [12]:
import os
import subprocess
import numpy
import torch
from torch import nn
from d2l import torch as d2l


In [14]:
# Warmup for GPU computation
device = d2l.try_gpu()
a = torch.randn(size=(1000, 1000), device=device)
b = torch.mm(a, a)

with d2l.Benchmark('numpy'):
    for _ in range(10):
        a = numpy.random.normal(size=(1000, 1000))
        b = numpy.dot(a, a)
        
with d2l.Benchmark('torch'):
    for _ in range(10):
        a = torch.randn(size=(1000, 1000), device=device)
        b = torch.mm(a, a)
    torch.cuda.synchronize(device)

numpy: 0.5679 sec
torch: 0.2431 sec


AssertionError: Torch not compiled with CUDA enabled