In [1]:
import time
import pandas as pd
from common import utils
# Show the GPUs detected on this machine; as the cell's last expression, the
# returned value is rendered as the cell output.
utils.get_gpu_info()

{'0000:07:00.0,Tesla V100-DGXS-16GB': 'computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s',
 '0000:08:00.0,Tesla V100-DGXS-16GB': 'computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s',
 '0000:0e:00.0,Tesla V100-DGXS-16GB': 'computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s',
 '0000:0f:00.0,Tesla V100-DGXS-16GB': 'computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s'}

In [14]:
# NOTE(review): presumably configures NVLink counter 0 to count bytes for all
# traffic ("0bz") so NVLink throughput can be reported later -- confirm against
# `nvidia-smi nvlink -h`. The return value is bound to the throwaway name `_`.
_ = utils.run_command("nvidia-smi nvlink -sc 0bz")

# DL Benchmarks Runner

* Configuration: Computer vision, <10GB VRAM

* Adjust `threads = XX` below according to your CPU (we suggest leaving 1 core / 2 threads free for OS/background tasks)

In [3]:
# Number of CPU threads passed to each benchmark via --threads; tune to your CPU.
threads = 30

In [4]:
# Normalise the thread count to a decimal string so it can be concatenated
# directly into the benchmark command lines.
threads = "{:d}".format(int(threads))
# Wall-clock start of the whole benchmark run (seconds since the epoch).
start_time = time.time()

## Image Classification (ResNet-50 MLPerf model)

This ResNet-50 is identical to the MLPerf reference implementation and is one of the most commonly performed benchmarks.

**No image augmentation (GPU limited)**

In [5]:
# Benchmark: ResNet-50 without image augmentation, batch size 64.
exp_name = "rn50"
print("Running:", exp_name)
results = utils.run_command("python3 run_cnn.py --threads "+threads+" --batch_size 64")
# The last element of `results` is treated as a comma-separated summary whose
# first field is the status. Guard against empty output so a run that produced
# nothing is reported as FAIL instead of raising IndexError.
# NOTE(review): assumes run_command returns a list-like of lines -- confirm.
if results and results[-1].split(",")[0] == "PASS":
    rn50_10gb = exp_name + "," + results[-1]
    print(rn50_10gb)
else:
    # Always (re)bind the result so the export cell neither hits a NameError nor
    # silently reuses a row left over from an earlier run in this kernel.
    # The 11 empty fields pad the row to the 13 columns used when exporting.
    rn50_10gb = exp_name + ",FAIL" + "," * 11
    print(exp_name, "FAIL")

Running: rn50
rn50,PASS,2674.6,64,41,4,0.7,55,167,49,9.7,3.2,[]


**Image augmentation used (CPU limited)**

In [6]:
# Benchmark: ResNet-50 with image augmentation enabled (--img_aug), batch size 64.
exp_name = "rn50_imgaug"
print("Running:", exp_name)
results = utils.run_command("python3 run_cnn.py --threads "+threads+" --batch_size 64 --img_aug")
# The last element of `results` is treated as a comma-separated summary whose
# first field is the status. Guard against empty output so a run that produced
# nothing is reported as FAIL instead of raising IndexError.
# NOTE(review): assumes run_command returns a list-like of lines -- confirm.
if results and results[-1].split(",")[0] == "PASS":
    rn50_imgaug_10gb = exp_name + "," + results[-1]
    print(rn50_imgaug_10gb)
else:
    # Always (re)bind the result so the export cell neither hits a NameError nor
    # silently reuses a row left over from an earlier run in this kernel.
    # The 11 empty fields pad the row to the 13 columns used when exporting.
    rn50_imgaug_10gb = exp_name + ",FAIL" + "," * 11
    print(exp_name, "FAIL")

Running: rn50_imgaug
rn50_imgaug,PASS,2534.3,62,39,5,0.8,53,161,48,9.7,3.0,[]


## Image Classification (CNN Stress-test Model)

A large, compute-heavy CNN used to stress-test GPU compute.

In [7]:
# Benchmark: large stress-test CNN (--huge_cnn), batch size 64.
exp_name = "hugecnn-10gb"
print("Running:", exp_name)
results = utils.run_command("python3 run_cnn.py --threads "+threads+" --batch_size 64 --huge_cnn")
# The last element of `results` is treated as a comma-separated summary whose
# first field is the status. Guard against empty output so a run that produced
# nothing is reported as FAIL instead of raising IndexError.
# NOTE(review): assumes run_command returns a list-like of lines -- confirm.
if results and results[-1].split(",")[0] == "PASS":
    hugecnn_10gb = exp_name + "," + results[-1]
    print(hugecnn_10gb)
else:
    # Always (re)bind the result so the export cell neither hits a NameError nor
    # silently reuses a row left over from an earlier run in this kernel.
    # The 11 empty fields pad the row to the 13 columns used when exporting.
    hugecnn_10gb = exp_name + ",FAIL" + "," * 11
    print(exp_name, "FAIL")

Running: hugecnn-10gb
hugecnn-10gb,PASS,2001.3,90,34,13,2.1,77,232,53,9.7,14.7,[]


In [8]:
# Stop the wall clock and report the whole seconds elapsed since `start_time`.
end_time = time.time()
elapsed_seconds = int(end_time - start_time)
print("Total time taken:", elapsed_seconds, "seconds.")

Total time taken: 599 seconds.


# Export Results

Values reported are for GPU 0 (the first GPU), except for NVLink, which is the average TX+RX across all GPUs.

In [11]:
# Column names for one benchmark result row (one comma-separated field each).
cols = ["name", "passed", "avg_fps", "avg_sm_%", "avg_mem_io_%", "avg_pcie_%", "pcie_gbps", "avg_pwr_%", "pwr_watts", "avg_temp", "max_vram", "avg_nvlink", "throttle"]
# Split each row into at most len(cols) fields. The trailing "throttle" field is
# rendered like a list (e.g. "[]"), so it may itself contain commas -- TODO
# confirm against run_cnn.py. Capping the split keeps such a value in a single
# cell instead of yielding extra fields that break the DataFrame constructor.
result_rows = [
    row.split(",", len(cols) - 1)
    for row in (rn50_10gb, rn50_imgaug_10gb, hugecnn_10gb)
]
df = pd.DataFrame(result_rows, columns=cols)

In [12]:
# Display the collected benchmark results (head() shows at most the first 5 rows).
df.head()

Unnamed: 0,name,passed,avg_fps,avg_sm_%,avg_mem_io_%,avg_pcie_%,pcie_gbps,avg_pwr_%,pwr_watts,avg_temp,max_vram,avg_nvlink,throttle
0,rn50,PASS,2674.6,64,41,4,0.7,55,167,49,9.7,3.2,[]
1,rn50_imgaug,PASS,2534.3,62,39,5,0.8,53,161,48,9.7,3.0,[]
2,hugecnn-10gb,PASS,2001.3,90,34,13,2.1,77,232,53,9.7,14.7,[]
