<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/0518.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 8.1 命令式和符号式混合编程

In [2]:
def add(a, b):
    """Return ``a + b`` for any two operands that support ``+``."""
    total = a + b
    return total

def fancy_func(a, b, c, d):
    """Combine four values as ``add(add(a, b), add(c, d))``."""
    left_sum = add(a, b)
    right_sum = add(c, d)
    return add(left_sum, right_sum)

fancy_func(1, 2, 3, 4)

10

In [3]:
def add_str():
    """Return Python source text that defines ``add(a, b)``.

    The text begins with an empty line and ends with a newline, so it can be
    concatenated directly with further source fragments.
    """
    source_lines = [
        '',
        'def add(a, b):',
        '    return a + b ',
        '',
    ]
    return '\n'.join(source_lines)

def fancy_func_str():
    """Return Python source text that defines ``fancy_func(a, b, c, d)``.

    The generated function calls ``add``, which must exist in the namespace
    the text is executed in. The first line of the text is a single space.
    """
    source_lines = [
        ' ',
        'def fancy_func(a, b, c, d):',
        '    e = add(a, b)',
        '    f = add(c, d)',
        '    g = add(e, f)',
        '    return g ',
        '',
    ]
    return '\n'.join(source_lines)

def evoke_str():
    """Return a complete program: both definitions plus a call that prints the result.

    The text is the output of ``add_str()``, then ``fancy_func_str()``, then
    a statement printing ``fancy_func(1, 2, 3, 4)``.
    """
    call_snippet = ' \nprint(fancy_func(1, 2, 3, 4))\n'
    fragments = (add_str(), fancy_func_str(), call_snippet)
    return ''.join(fragments)

# Build the whole program as text, show it, then compile and execute it.
# exec() is called without an explicit namespace, so the `add` and
# `fancy_func` defined by the text are bound in the current namespace.
prog = evoke_str()
print(prog)
y = compile(prog, '', 'exec')  # empty filename; 'exec' mode accepts a sequence of statements
exec(y)


def add(a, b):
    return a + b 
 
def fancy_func(a, b, c, d):
    e = add(a, b)
    f = add(c, d)
    g = add(e, f)
    return g 
 
print(fancy_func(1, 2, 3, 4))

10


In [0]:
!pip install mxnet-cu100 d2lzh

In [5]:
from mxnet import nd, sym 
from mxnet.gluon import nn 
import time 

def get_net():
    """Build and initialize a ``HybridSequential`` MLP.

    The network stacks Dense(256, relu), Dense(128, relu) and Dense(2), and
    is initialized with the default initializer before being returned.
    """
    mlp = nn.HybridSequential()
    hidden_1 = nn.Dense(256, activation='relu')
    hidden_2 = nn.Dense(128, activation='relu')
    output = nn.Dense(2)
    for layer in (hidden_1, hidden_2, output):
        mlp.add(layer)

    mlp.initialize()
    return mlp

x = nd.random.normal(shape=(1, 512))
net = get_net()
net(x)


[[0.08827586 0.0050518 ]]
<NDArray 1x2 @cpu(0)>

In [6]:
net.hybridize()
net(x)


[[0.08827586 0.0050518 ]]
<NDArray 1x2 @cpu(0)>

In [7]:
def benchmark(net, x, num_iters=10000):
    """Return the seconds taken to call ``net(x)`` ``num_iters`` times.

    Args:
        net: Callable network; it is invoked as ``net(x)``.
        x: Input passed unchanged to every call.
        num_iters: Number of forward calls to time. Defaults to 10000, the
            value that was previously hard-coded.

    Returns:
        Elapsed wall-clock time in seconds, as a float.
    """
    start = time.time()
    for _ in range(num_iters):
        net(x)
    # Wait for all outstanding work before stopping the clock, so the result
    # covers the computation itself and not only issuing the calls.
    nd.waitall()
    return time.time() - start

# Time the same freshly built network twice: first as constructed, then again
# after hybridize() has been called on it.
net = get_net()
print('before: %.4f sec' % (benchmark(net, x)))
net.hybridize()
print('after: %.4f sec' % (benchmark(net, x)))

before: 6.2879 sec
after: 2.5578 sec


In [0]:
net.export('my_mlp')

In [9]:
x = sym.var('data')
net(x)

<Symbol dense5_fwd>

In [0]:
class HybridNet(nn.HybridBlock):
    """Two-layer hybrid block that prints what passes through ``hybrid_forward``.

    The prints expose which module is passed in as ``F`` and what the input
    and hidden activations look like whenever the method body actually runs.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.hidden = nn.Dense(10)
        self.output = nn.Dense(2)

    def hybrid_forward(self, F, x):
        """Apply hidden layer -> ReLU -> output layer, printing each stage."""
        print('F: ', F)
        print('x: ', x)
        hidden_act = F.relu(self.hidden(x))
        print('hidden: ', hidden_act)
        out = self.output(hidden_act)
        return out

In [11]:
net = HybridNet()
net.initialize()
x = nd.random.normal(shape=(1, 4))
net(x)

F:  <module 'mxnet.ndarray' from '/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/__init__.py'>
x:  
[[-0.12225834  0.5429998  -0.9469352   0.59643304]]
<NDArray 1x4 @cpu(0)>
hidden:  
[[0.11134676 0.04770704 0.05341475 0.         0.08091211 0.
  0.         0.04143535 0.         0.        ]]
<NDArray 1x10 @cpu(0)>



[[0.00370749 0.00134991]]
<NDArray 1x2 @cpu(0)>

In [12]:
net(x)

F:  <module 'mxnet.ndarray' from '/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/__init__.py'>
x:  
[[-0.12225834  0.5429998  -0.9469352   0.59643304]]
<NDArray 1x4 @cpu(0)>
hidden:  
[[0.11134676 0.04770704 0.05341475 0.         0.08091211 0.
  0.         0.04143535 0.         0.        ]]
<NDArray 1x10 @cpu(0)>



[[0.00370749 0.00134991]]
<NDArray 1x2 @cpu(0)>

In [13]:
net.hybridize()
net(x)

F:  <module 'mxnet.symbol' from '/usr/local/lib/python3.6/dist-packages/mxnet/symbol/__init__.py'>
x:  <Symbol data>
hidden:  <Symbol hybridnet0_relu0>



[[0.00370749 0.00134991]]
<NDArray 1x2 @cpu(0)>

In [14]:
net(x)


[[0.00370749 0.00134991]]
<NDArray 1x2 @cpu(0)>

## 8.2 异步计算

In [0]:
from mxnet import autograd, gluon, nd 
from mxnet.gluon import loss as gloss, nn 
import os 
import subprocess 
import time 

In [16]:
a = nd.ones((1, 2))
b = nd.ones((1, 2))
c = a * b + 2 
c


[[3. 3.]]
<NDArray 1x2 @cpu(0)>

In [0]:
class Benchmark():
    """Context manager that prints the wall-clock time spent inside its block.

    Args:
        prefix: Optional label printed before the timing. A single space is
            appended to it; when omitted or empty, no label is printed.
    """

    def __init__(self, prefix=None):
        self.prefix = prefix + ' ' if prefix else ''

    def __enter__(self):
        self.start = time.time()
        # Return the instance so `with Benchmark() as b:` binds something
        # usable instead of None.
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised inside the block still propagates.
        print('%stime: %.4f sec' % (self.prefix, time.time() - self.start))

In [18]:
# This block times only how long it takes to issue the two operations.
with Benchmark('Workloads are queued.'):
    x = nd.random.uniform(shape=(2000, 2000))
    y = nd.dot(x, x).sum()

# Printing y needs its actual value, so this block presumably also includes
# the wait for the work issued above — compare the two timings printed below.
with Benchmark('Workloads are finished.'):
    print('sum=', y)

Workloads are queued. time: 0.0013 sec
sum= 
[1.9998138e+09]
<NDArray 1 @cpu(0)>
Workloads are finished. time: 0.4028 sec


In [19]:
with Benchmark():
    y = nd.dot(x, x)
    y.wait_to_read()

time: 0.2891 sec


In [20]:
with Benchmark():
    y = nd.dot(x, x)
    z = nd.dot(x, x)
    nd.waitall()

time: 0.5709 sec


In [21]:
with Benchmark():
    y = nd.dot(x,x)
    y.asnumpy()

time: 0.2777 sec


In [22]:
with Benchmark():
    y = nd.dot(x, x)
    y.norm().asscalar()

time: 0.3418 sec


In [23]:
# Synchronous variant: wait for each result before issuing the next operation.
with Benchmark('syn.'):
    for _ in range(1000):
        y = x + 1 
        y.wait_to_read()

# Asynchronous variant: issue all 1000 operations, then wait once at the end.
with Benchmark('asyn.'):
    for _ in range(1000):
        y = x + 1 
    nd.waitall()

syn. time: 5.1978 sec
asyn. time: 5.1456 sec


In [0]:
def data_iter():
    """Yield 100 synthetic ``(features, labels)`` batches of 1024 examples.

    Features are drawn from a normal distribution with shape (1024, 512);
    labels are all ones with shape (1024,). After every 50th batch has been
    consumed, the time elapsed since the generator started is printed.
    """
    start = time.time()
    num_batches, batch_size = 100, 1024
    for batch_no in range(1, num_batches + 1):
        features = nd.random.normal(shape=(batch_size, 512))
        labels = nd.ones((batch_size, ))
        yield features, labels
        # Runs only when the consumer asks for the next batch.
        if batch_no % 50 == 0:
            print('batch %d, time %f sec' % (batch_no, time.time() - start))

In [0]:
net = nn.Sequential()
net.add(nn.Dense(2048, activation='relu'), 
    nn.Dense(512, activation='relu'), 
    nn.Dense(1))
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.005})
loss = gloss.L2Loss()

In [0]:
def get_mem():
    """Return the RSS value that ``ps`` reports for this process, divided by 1000.

    Runs ``ps u -p <pid>`` for the current process, finds the ``RSS`` column
    by name in the header line and reads that column from the process row.
    ``ps`` normally reports RSS in kilobytes, so the result is approximately
    megabytes (callers in this notebook print it as MB).

    Returns:
        float: RSS value / 1e3.

    Raises:
        subprocess.CalledProcessError: if ``ps`` exits with a non-zero status.
        ValueError: if the output has no ``RSS`` column or fewer than two lines.
    """
    res = subprocess.check_output(['ps', 'u', '-p', str(os.getpid())])
    # Decode the bytes instead of parsing their repr, and look the column up
    # by header name rather than relying on a fixed token position.
    header, process_row = res.decode().splitlines()[:2]
    rss_column = header.split().index('RSS')
    return int(process_row.split()[rss_column]) / 1e3

In [0]:
for X, y in data_iter():
    break 
loss(y, net(X)).wait_to_read()

In [28]:
# Variant 1: read the loss back into Python as a scalar on every batch.
l_sum, mem = 0, get_mem()
for X, y in data_iter():
    with autograd.record():
        l = loss(y, net(X))
    # asscalar() needs the actual value, so each iteration presumably waits
    # for this batch's forward pass before the next batch is requested.
    l_sum += l.mean().asscalar()
    l.backward()
    trainer.step(X.shape[0])
nd.waitall()  # finish all outstanding work before measuring memory again
print('increased memory: %f MB' %(get_mem() - mem))

batch 50, time 13.615280 sec
batch 100, time 27.390618 sec
increased memory: 11.188000 MB


In [29]:
# Variant 2: same loop, but nothing is read back inside it; the only explicit
# wait is the nd.waitall() after the loop.
mem = get_mem()
for X, y in data_iter():
    with autograd.record():
        l = loss(y, net(X))
    l.backward()
    trainer.step(X.shape[0])
nd.waitall()  # finish all outstanding work before measuring memory again
print('increased memory: %f MB' % (get_mem() - mem))

batch 50, time 0.131212 sec
batch 100, time 0.273308 sec
increased memory: 0.004000 MB


## 8.3 并行计算

In [0]:
import d2lzh as d2l 
import mxnet as mx 
from mxnet import nd 

In [0]:
def run(x):
    """Return a list holding the result of ``nd.dot(x, x)`` computed 10 times."""
    products = []
    for _ in range(10):
        products.append(nd.dot(x, x))
    return products

In [0]:
x_cpu = nd.random.uniform(shape=(2000, 2000))
x_gpu = nd.random.uniform(shape=(6000, 6000), ctx=mx.gpu(0))

In [33]:
# Run each workload once untimed and wait for it; presumably a warm-up so
# one-time start-up costs do not end up in the measurements below.
run(x_cpu)
run(x_gpu)
nd.waitall()

# Time the CPU workload on its own, waiting for it to finish inside the block.
with d2l.Benchmark('Run on CPU.'):
    run(x_cpu)
    nd.waitall()

# Time the GPU workload on its own in the same way.
with d2l.Benchmark('Run on GPU.'):
    run(x_gpu)
    nd.waitall()

Run on CPU. time: 2.5805 sec
Run on GPU. time: 1.8481 sec


In [34]:
with d2l.Benchmark('Run on CPU and GPU in parallel.'):
    run(x_cpu)
    run(x_gpu)
    nd.waitall()

Run on CPU and GPU in parallel. time: 3.3715 sec


In [35]:
def copy_to_cpu(x):
    """Return a list with a CPU copy of every array in the iterable ``x``."""
    cpu_copies = []
    for arr in x:
        cpu_copies.append(arr.copyto(mx.cpu()))
    return cpu_copies

with d2l.Benchmark('Run on GPU.'):
    y = run(x_gpu)
    nd.waitall()

with d2l.Benchmark('Then copy to CPU.'):
    copy_to_cpu(y)
    nd.waitall()

Run on GPU. time: 1.8620 sec
Then copy to CPU. time: 0.4153 sec


In [36]:
with d2l.Benchmark('Run and copy in parallel.'):
    y = run(x_gpu)
    copy_to_cpu(y)
    nd.waitall()

Run and copy in parallel. time: 1.9614 sec


## 8.4 多 GPU 计算

In [37]:
!nvidia-smi

Wed Nov 13 07:53:25 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.50       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    71W / 149W |   1934MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
import d2lzh as d2l 
import mxnet as mx 
from mxnet.gluon import loss as gloss 
from mxnet import autograd, nd 
import time 

In [0]:
# Model parameters, created by hand. Weights are drawn from a normal
# distribution with standard deviation `scale`; biases start at zero.
scale = 0.01 
# First convolution: 20 filters of shape 1x3x3.
W1 = nd.random.normal(scale=scale, shape=(20, 1, 3, 3))
b1 = nd.zeros(shape=20)
# Second convolution: 50 filters of shape 20x5x5.
W2 = nd.random.normal(scale=scale, shape=(50, 20, 5, 5))
b2 = nd.zeros(shape=50)
# First dense layer: 800 -> 128.
# NOTE(review): 800 presumably equals 50 channels x 4 x 4 for 28x28 inputs — confirm.
W3 = nd.random.normal(scale=scale, shape=(800, 128))
b3 = nd.zeros(shape=128)
# Output dense layer: 128 -> 10.
W4 = nd.random.normal(scale=scale, shape=(128, 10))
b4 = nd.zeros(shape=10)
# Order matters: lenet() reads the parameters by position in this list.
params = [W1, b1, W2, b2, W3, b3, W4, b4]

In [0]:
def lenet(X, params):
    """Run the hand-written LeNet-style forward pass.

    Args:
        X: Input batch passed to the first convolution.
        params: Sequence read by position: indices 0-1 are the first
            convolution's weight and bias, 2-3 the second convolution's,
            4-5 the first dense layer's and 6-7 the output layer's.

    Returns:
        The output of the last dense layer (no softmax is applied here).
    """
    # Stage 1: 3x3 convolution (20 filters) -> ReLU -> 2x2 average pooling.
    conv1 = nd.Convolution(data=X, weight=params[0], bias=params[1],
                           kernel=(3, 3), num_filter=20)
    pool1 = nd.Pooling(data=nd.relu(conv1), pool_type='avg',
                       kernel=(2, 2), stride=(2, 2))

    # Stage 2: 5x5 convolution (50 filters) -> ReLU -> 2x2 average pooling.
    conv2 = nd.Convolution(data=pool1, weight=params[2], bias=params[3],
                           kernel=(5, 5), num_filter=50)
    pool2 = nd.Pooling(data=nd.relu(conv2), pool_type='avg',
                       kernel=(2, 2), stride=(2, 2))

    # Dense head: flatten -> dense + ReLU -> dense.
    flat = nd.flatten(pool2)
    hidden = nd.relu(nd.dot(flat, params[4]) + params[5])
    return nd.dot(hidden, params[6]) + params[7]

loss = gloss.SoftmaxCrossEntropyLoss()

In [0]:
def get_params(params, ctx):
    """Copy every parameter to ``ctx`` and attach gradient storage to each copy.

    Args:
        params: Iterable of arrays providing ``copyto``.
        ctx: Target passed to each array's ``copyto``.

    Returns:
        list: The copies, in the same order as ``params``.
    """
    copies = []
    for original in params:
        copies.append(original.copyto(ctx))
    # Attach gradients only after all copies exist.
    for copied in copies:
        copied.attach_grad()
    return copies

In [42]:
new_params = get_params(params, mx.gpu(0))
print('b1 weight:', new_params[1])
print('b1 grad:', new_params[1].grad)

b1 weight: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 20 @gpu(0)>
b1 grad: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 20 @gpu(0)>


In [0]:
def allreduce(data):
    """Sum all arrays into ``data[0]`` and copy that sum back into the others.

    Every array in ``data`` is modified in place; nothing is returned. Each
    array is copied to the context of ``data[0]`` before being added.
    """
    # Reduce: accumulate every other array into the first one.
    for other in data[1:]:
        data[0][:] += other.copyto(data[0].context)
    # Broadcast: overwrite every other array with the accumulated sum.
    for other in data[1:]:
        data[0].copyto(other)

In [44]:
data = [nd.ones((1, 2), ctx=mx.gpu(i)) * (i + 1) for i in range(1)]
print('before: ', data)
allreduce(data)
print('after: ', data)

before:  [
[[1. 1.]]
<NDArray 1x2 @gpu(0)>]
after:  [
[[1. 1.]]
<NDArray 1x2 @gpu(0)>]


In [0]:
def split_and_load(data, ctx):
    """Split ``data`` evenly along its first axis, one shard per entry of ``ctx``.

    Args:
        data: Array with a ``shape`` attribute, slicing and ``as_in_context``.
        ctx: Sequence of target contexts.

    Returns:
        list: Shard ``i`` holds rows ``[i * m, (i + 1) * m)`` placed on
        ``ctx[i]``, where ``m`` is the number of rows divided by ``len(ctx)``.

    Raises:
        AssertionError: if the number of rows is not a multiple of ``len(ctx)``.
    """
    num_examples = data.shape[0]
    num_devices = len(ctx)
    shard_size, leftover = divmod(num_examples, num_devices)
    assert leftover == 0, '# examples is not divided by # devices.'
    shards = []
    for idx in range(num_devices):
        begin = idx * shard_size
        shards.append(data[begin: begin + shard_size].as_in_context(ctx[idx]))
    return shards

In [46]:
batch = nd.arange(24).reshape((6, 4))
ctx = [mx.gpu(0)]
splitted = split_and_load(batch, ctx)
print('input: ', batch)
print('load into', ctx)
print('output:', splitted)

input:  
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]
 [16. 17. 18. 19.]
 [20. 21. 22. 23.]]
<NDArray 6x4 @cpu(0)>
load into [gpu(0)]
output: [
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]
 [16. 17. 18. 19.]
 [20. 21. 22. 23.]]
<NDArray 6x4 @gpu(0)>]


In [0]:
def train_batch(X, y, gpu_params, ctx, lr):
    """Run one data-parallel SGD step on a single mini-batch.

    Args:
        X: Batch of inputs; split evenly across ``ctx``.
        y: Batch of labels; split the same way as ``X``.
        gpu_params: One parameter list per device, in the order of ``ctx``.
        ctx: List of devices.
        lr: Learning rate passed to ``d2l.sgd``.
    """
    gpu_Xs = split_and_load(X, ctx)
    gpu_ys = split_and_load(y, ctx)
    # Forward pass: one loss per device, each on that device's shard and parameters.
    with autograd.record():
        ls = []
        for gpu_X, gpu_y, gpu_W in zip(gpu_Xs, gpu_ys, gpu_params):
            ls.append(loss(lenet(gpu_X, gpu_W), gpu_y))
    # Backward pass on every device.
    for device_loss in ls:
        device_loss.backward()
    # Sum each parameter's gradient across devices and share the sum.
    num_devices = len(ctx)
    for param_idx in range(len(gpu_params[0])):
        grads = [gpu_params[dev][param_idx].grad for dev in range(num_devices)]
        allreduce(grads)
    # Update every device's copy, using the full batch size.
    for device_params in gpu_params:
        d2l.sgd(device_params, lr, X.shape[0])

In [0]:
def train(num_gpus, batch_size, lr):
    """Train the hand-written LeNet on Fashion-MNIST for 4 epochs.

    Args:
        num_gpus: Number of GPUs to use (``gpu(0)`` .. ``gpu(num_gpus - 1)``).
        batch_size: Mini-batch size passed to the data loader.
        lr: Learning rate.

    After each epoch, prints the training time and the test accuracy measured
    with the first device's parameters.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(dev_id) for dev_id in range(num_gpus)]
    print('running on:', ctx)
    # One private copy of the global parameters per device.
    gpu_params = [get_params(params, device) for device in ctx]

    def net(x):
        # Evaluation uses the parameter copy held by the first device.
        return lenet(x, gpu_params[0])

    for epoch in range(1, 5):
        start = time.time()
        for X, y in train_iter:
            train_batch(X, y, gpu_params, ctx, lr)
            nd.waitall()
        train_time = time.time() - start

        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test acc %.2f' % (epoch, train_time, test_acc))

In [49]:
train(num_gpus=1, batch_size=256, lr=0.2)

running on: [gpu(0)]
epoch 1, time 8.8 sec, test acc 0.10
epoch 2, time 8.6 sec, test acc 0.55
epoch 3, time 8.7 sec, test acc 0.70
epoch 4, time 8.8 sec, test acc 0.72


## 8.5 多 GPU 计算的简洁实现

In [0]:
import d2lzh as d2l 
import mxnet as mx 
from mxnet import autograd, gluon, init, nd 
from mxnet.gluon import loss as gloss, utils as gutils, nn 
import time 

In [0]:
def resnet18(num_classes):
    """Build an (uninitialized) ResNet-18 style network.

    Args:
        num_classes: Number of units in the final dense layer.

    Returns:
        ``nn.Sequential`` consisting of a 3x3 convolution stem with batch
        norm and ReLU, four stages of two residual units each (64, 128, 256
        and 512 channels), global average pooling and a dense layer.
    """
    def resnet_block(num_channels, num_residuals, first_block=False):
        # A stage of `num_residuals` residual units. Except in the first
        # stage, the leading unit uses a 1x1 convolution with stride 2.
        stage = nn.Sequential()
        for idx in range(num_residuals):
            downsample = idx == 0 and not first_block
            if downsample:
                unit = d2l.Residual(num_channels, use_1x1conv=True, strides=2)
            else:
                unit = d2l.Residual(num_channels)
            stage.add(unit)
        return stage

    net = nn.Sequential()
    stem = [
        nn.Conv2D(64, kernel_size=3, strides=1, padding=1),
        nn.BatchNorm(),
        nn.Activation('relu'),
    ]
    net.add(*stem)
    stage_specs = ((64, True), (128, False), (256, False), (512, False))
    for channels, is_first in stage_specs:
        net.add(resnet_block(channels, 2, first_block=is_first))
    net.add(nn.GlobalAvgPool2D(), nn.Dense(num_classes))
    return net

net = resnet18(10)

In [0]:
ctx = [mx.gpu(0)]
net.initialize(init=init.Normal(sigma=0.01), ctx=ctx)

In [53]:
x = nd.random.uniform(shape=(4, 1, 28, 28))
gpu_x = gutils.split_and_load(x, ctx)
net(gpu_x[0])


[[-5.7465440e-06 -3.3317165e-06 -1.3032401e-06 -1.3137640e-06
  -3.0325145e-06 -5.3874373e-06 -4.6628379e-06  1.3724061e-06
  -2.9884850e-06  2.7169640e-06]
 [-6.0186107e-06 -3.9834722e-06 -1.0086044e-06 -1.3575535e-06
  -2.4883661e-06 -5.1102088e-06 -5.1461625e-06  1.0052943e-06
  -2.8755123e-06  2.1098490e-06]
 [-5.5376263e-06 -3.8441981e-06 -1.0446329e-06 -8.3044563e-07
  -3.0834099e-06 -4.9034970e-06 -4.6071827e-06  6.2406548e-07
  -2.4988947e-06  2.3911816e-06]
 [-6.0026546e-06 -3.7010204e-06 -9.8344708e-07 -9.2437887e-07
  -2.3985406e-06 -5.5074138e-06 -4.9411633e-06  1.3234688e-06
  -3.3002682e-06  3.2781813e-06]]
<NDArray 4x10 @gpu(0)>

In [54]:
# Look up the weight parameter of the network's first child block.
weight = net[0].params.get('weight')

# Try to read the weight without naming a context; a RuntimeError is reported
# as the parameter not being initialized on the CPU.
try:
    weight.data()
except RuntimeError:
    print('not initialized on', mx.cpu())
# Display the first slice along axis 0 of the copy that lives on ctx[0].
weight.data(ctx[0])[0]


[[[-0.00481301  0.01521761 -0.00545954]
  [ 0.01197755 -0.00073117  0.00322847]
  [-0.00564734 -0.00043122  0.00359513]]]
<NDArray 1x3x3 @gpu(0)>

In [0]:
def train(num_gpus, batch_size, lr):
    """Train the global ``net`` on Fashion-MNIST for 4 epochs with Gluon.

    Args:
        num_gpus: Number of GPUs to use (``gpu(0)`` .. ``gpu(num_gpus - 1)``).
        batch_size: Mini-batch size passed to the data loader.
        lr: Learning rate for SGD.

    The network is re-initialized on every call. After each epoch, prints the
    training time and the test accuracy evaluated on the first device.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', ctx)
    # force_reinit discards any parameters left over from an earlier run.
    net.initialize(init=init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            gpu_Xs = gutils.split_and_load(X, ctx)
            gpu_ys = gutils.split_and_load(y, ctx)
            with autograd.record():
                ls = [loss(net(gpu_X), gpu_y)
                      for gpu_X, gpu_y in zip(gpu_Xs, gpu_ys)]
            for l in ls:
                l.backward()
            # Normalize by the number of examples actually in this batch: the
            # final batch of an epoch may hold fewer than `batch_size` examples,
            # and dividing by the nominal size would shrink its update.
            trainer.step(X.shape[0])
        nd.waitall()
        train_time = time.time() - start
        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test acc %.2f' % (
            epoch + 1, train_time, test_acc))

In [56]:
train(num_gpus=1, batch_size=256, lr=0.1)

running on: [gpu(0)]
epoch 1, time 91.7 sec, test acc 0.86
epoch 2, time 84.9 sec, test acc 0.91
epoch 3, time 84.7 sec, test acc 0.90
epoch 4, time 84.9 sec, test acc 0.93


In [0]:
def train(num_gpus, batch_size, lr):
    """Re-initialize the global ``net`` and train it on Fashion-MNIST for 4 epochs.

    Args:
        num_gpus: Number of GPUs to use (``gpu(0)`` .. ``gpu(num_gpus - 1)``).
        batch_size: Mini-batch size passed to the data loader.
        lr: Learning rate for SGD.

    Each batch is split across the devices, the per-device losses are
    back-propagated, and a single trainer step applies the update. After each
    epoch, the training time and the test accuracy on the first device are
    printed.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', ctx)
    # force_reinit discards any parameters left over from an earlier run.
    net.initialize(init=init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            gpu_Xs = gutils.split_and_load(X, ctx)
            gpu_ys = gutils.split_and_load(y, ctx)
            with autograd.record():
                ls = [loss(net(gpu_X), gpu_y) for gpu_X, gpu_y in zip(gpu_Xs, gpu_ys)]
            for l in ls:
                l.backward()
            # Use the real size of this batch rather than the nominal
            # `batch_size`, so a smaller final batch is not under-weighted.
            trainer.step(X.shape[0])
        nd.waitall()
        train_time = time.time() - start
        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test acc %.2f' % (epoch + 1, train_time, test_acc))
                

In [60]:
train(num_gpus=1, batch_size=256, lr=0.1)

running on: [gpu(0)]
epoch 1, time 85.9 sec, test acc 0.89
epoch 2, time 84.7 sec, test acc 0.92
epoch 3, time 84.5 sec, test acc 0.93
epoch 4, time 85.0 sec, test acc 0.91
