## Python版

元のリポジトリを下記のコマンドで git submodule として取り込み、  
neuralnet_mnist_int.py をコピーする。

```
git submodule add https://github.com/oreilly-japan/deep-learning-from-scratch.git
```

In [None]:
import neuralnet_mnist_int
import numpy as np

In [None]:
# neuralnet_mnist_int.main()

In [None]:
# x_test, y_test = neuralnet_mnist_int.get_data()

In [None]:
# Every helper call for the Python run receives this same mode string.
mode = "INT_MODE"
x_test, y_test = neuralnet_mnist_int.get_data(mode=mode)
network = neuralnet_mnist_int.init_network(mode=mode)

# Uninitialised uint8 result array with one slot per entry of x_test.
py_y_test = np.empty(shape=len(x_test), dtype=np.uint8)


In [None]:
%%time
## Python版の結果
for i in range(len(x_test)):
    y,_ = neuralnet_mnist_int.predict(network, x_test[i], mode=mode)
    py_y_test[i] = np.argmax(y) # 最も確率の高い要素のインデックスを取得

In [None]:
# Display the argmax results collected from the Python run
py_y_test

## FPGA版 全データ

In [None]:
## FPGA Load: create the overlay from the bitstream file and download it
from pynq import Overlay
OL = Overlay("/home/xilinx/pynq/overlays/my_design/test_mnist_wrapper.bit")
OL.download()
# NOTE(review): XLNK is the overlay's processing_system7_0 attribute; later cells call
# cma_array / cma_memcopy on it — confirm this object provides those methods.
XLNK = OL.processing_system7_0

## show IPs: print the keys of the overlay's ip_dict together with XLNK
print(OL.ip_dict.keys(), XLNK)

In [None]:
# Alternative loader based on the `mnist` package (disabled, kept for reference):
# import mnist
# import numpy as np
# x_test = mnist.test_images()
# y_test = mnist.test_labels()
# x_test = x_test.reshape(10000, 28*28)

# Load the test data through neuralnet_mnist_int instead.
# NOTE(review): get_data() is called without mode= here, whereas the Python-version
# cell passes mode="INT_MODE" — confirm both calls return the same data.
import neuralnet_mnist_int
import numpy as np
x_test, y_test = neuralnet_mnist_int.get_data()
# Cast to uint8, the dtype used for the DMA input buffer
x_test = x_test.astype(np.uint8)

In [None]:
# Inspect the container type, shape and dtype of x_test
type(x_test), x_test.shape, x_test.dtype

In [None]:
## Allocate Memory
# Number of rows in x_test; the commented line below limits the run to 30 images.
IMAGE_NUM = x_test.shape[0]
# IMAGE_NUM = 30
print(IMAGE_NUM)
# Input buffer: 28*28 uint8 elements per image in one flat array
input_buf = XLNK.cma_array([28*28*IMAGE_NUM], np.uint8)
print(hex(input_buf.physical_address))
# Output buffer: one uint8 element per image
output_buf = XLNK.cma_array([1*IMAGE_NUM], np.uint8)
print(hex(output_buf.physical_address))

In [None]:
## Write output_buf: set the first IMAGE_NUM entries to 0xFF with a single slice
## assignment instead of one Python-level store per element.
# NOTE(review): 0xFF is presumably a sentinel that no valid result can equal — confirm.
output_buf[:IMAGE_NUM] = 0xFF
output_buf[0]

In [None]:
# %%time
# adr  = 0
# for i in range(0, IMAGE_NUM):
#     for t in test_x[i]:
#         input_buf[adr] = t
#         adr += 1
#     print(f"\r i={i}", end='')
# print()

In [None]:
## Write input_buf (DDR): copy x_test into input_buf, length 28*28*IMAGE_NUM
XLNK.cma_memcopy(input_buf, x_test, 28*28*IMAGE_NUM)

In [None]:
# N = 0
# for i in input_buf[(28*28)*N:(28*28)*(N+1)]:
#     print(f"{i:02X}")

In [None]:
def wait_dma(max_polls=100):
    """Poll the S2MM status of ``OL.axi_dma`` until its Idle bit is set.

    ``S2MM_DMASR.Idle`` is read up to ``max_polls`` times. After every read
    that finds the bit clear, the running poll count is printed on a single
    console line (``\\r`` rewrites it in place).

    Args:
        max_polls: Maximum number of status reads before giving up.
            Defaults to 100, the limit that was previously hard-coded.

    Returns:
        True if the Idle bit was seen set, False if the poll budget ran out
        first. A message is printed on timeout so that an unfinished
        transfer is not silently treated as complete.
    """
    for polls in range(1, max_polls + 1):
        # Re-read the register through the full attribute chain on every poll.
        if OL.axi_dma.register_map.S2MM_DMASR.Idle:
            print()
            return True
        print(f"\rWait for Idle: {polls}", end='')
    print()
    print(f"wait_dma: S2MM_DMASR.Idle still clear after {max_polls} polls")
    return False


In [None]:
%%time
## DMA Control
## Stop: write 0 to both channel control registers
OL.axi_dma.register_map.MM2S_DMACR = 0x0
OL.axi_dma.register_map.S2MM_DMACR = 0x0

## Run: write 1 to both channel control registers
OL.axi_dma.register_map.MM2S_DMACR = 0x1
OL.axi_dma.register_map.S2MM_DMACR = 0x1

## Address: MM2S_SA gets input_buf's physical address, S2MM_DA gets output_buf's
OL.axi_dma.register_map.MM2S_SA = input_buf.physical_address
OL.axi_dma.register_map.S2MM_DA = output_buf.physical_address

## Size: 28*28 per image for MM2S, 1 per image for S2MM
# NOTE(review): on AXI DMA the LENGTH write is presumably what starts each
# transfer, so it should stay after the address writes — confirm against PG021.
# OL.axi_dma.register_map.MM2S_LENGTH = 28*28*20
# OL.axi_dma.register_map.S2MM_LENGTH = 1*20
OL.axi_dma.register_map.MM2S_LENGTH = 28*28*IMAGE_NUM
OL.axi_dma.register_map.S2MM_LENGTH = 1*IMAGE_NUM

# Poll the S2MM status until it reports Idle or wait_dma's poll limit is reached
wait_dma()

In [None]:
## DMA status registers: MM2S first, then S2MM
OL.axi_dma.register_map.MM2S_DMASR, OL.axi_dma.register_map.S2MM_DMASR 

In [None]:
# Show the MM2S_LENGTH read-back in hex next to a few reference sizes (debug aid)
hex(OL.axi_dma.register_map.MM2S_LENGTH), hex(28*28*IMAGE_NUM), hex(28*28*21), hex(28*28*10000)

In [None]:
# Fraction of output_buf entries that equal the corresponding y_test entry
N = IMAGE_NUM
ok = sum(1 for expected, actual in zip(y_test, output_buf) if expected == actual)
print(ok/N)

In [None]:
# Print index, Python result and FPGA result wherever the two disagree
for idx, (py_pred, fpga_out) in enumerate(zip(py_y_test, output_buf)):
    if py_pred != fpga_out:
        print(f"{idx}, {py_pred} {fpga_out}")

## 他

In [None]:
## まとめ用
import re

# Labels looked up in a captured %%time report
TIME_GET = [
    "user",
    "sys",
    "total",
    "Wall time",
]

def get_sec(value, unit):
    """Convert a duration to seconds.

    ``value`` is anything ``float()`` accepts and ``unit`` is its unit
    suffix. "µs" and "ms" are scaled down to seconds; any other unit leaves
    the number unscaled.
    """
    seconds = float(value)
    for suffix, factor in (("µs", 10**(-6)), ("ms", 10**(-3))):
        if unit == suffix:
            return seconds * factor
    return seconds

def time_formatter_sec(s):
    """Extract the timings of a captured ``%%time`` report, in seconds.

    For every label in ``TIME_GET`` the first "<number> <unit>" pair that
    follows the label on the same line is located and converted with
    ``get_sec``. Accepted units are "s", "ms" and "µs".

    Args:
        s: Text of the report, e.g.
            "CPU times: user 191 ms, sys: 50.7 ms, total: 242 ms".

    Returns:
        dict mapping each label of ``TIME_GET`` to its value in seconds.

    Raises:
        ValueError: If a label, or a number/unit pair after it, is missing
            from ``s``.
    """
    res = {}
    for tg in TIME_GET:
        # Raw f-string so "\s" reaches the regex engine unchanged instead of
        # being an invalid string escape; the label is escaped so it is
        # always matched literally.
        m = re.search(rf"{re.escape(tg)}.*?([0-9.]+)\s([µm]*s)", s)
        if m is None:
            raise ValueError(f"no '{tg}' timing found in {s!r}")
        res[tg] = get_sec(m.group(1), m.group(2))
    return res

In [None]:
# %%time report captured from the Python-version run
python_result = """
CPU times: user 36.4 s, sys: 15.4 ms, total: 36.4 s
Wall time: 36.5 s
"""

# %%time report captured from the RTL (FPGA) run
rtl_result = """
CPU times: user 191 ms, sys: 50.7 ms, total: 242 ms
Wall time: 246 ms
"""

python_time = time_formatter_sec(python_result)
rtl_time = time_formatter_sec(rtl_result)

# One row per label: Python seconds | RTL seconds | ratio Python / RTL
for tg in TIME_GET:
    py_sec, rtl_sec = python_time[tg], rtl_time[tg]
    print(f"{tg} | {py_sec} | {rtl_sec} | {py_sec / rtl_sec:.1f}")

## まとめ

| | Python版 (秒) | RTL版 (秒) | 倍率 |
| -- | -- | -- | -- |
user | 36.4 | 0.191 | 190.6
sys | 0.0154 | 0.0507 | 0.3
total | 36.4 | 0.242 | 150.4
Wall time | 36.5 | 0.246 | 148.4

実行時間（Wall time）で比較すると、RTL版はPython版の約150倍高速。