# 测试8个NPU之间的HCCL通信带宽的脚本

- **平均测试时间**：是指某一算法先做 m 次预热，再做 n 次正式测试；计时从第 1 次正式测试开始（预热不计入），用总耗时除以 n 得到平均每次迭代的耗时

- **算法带宽**：是指申请内存大小除以平均测试时间得到的数值；该带宽同时包含数据传输、计算和内存复制的耗时

## I. 导入相关的包并定义初始化的变量

In [None]:
import os, re, time, json, stat

# Root directory of the HCCL test project (build tree, binaries and logs).
root_path = "/root/Workdir/hccl_test"

# Names of the collective tests to run; uncomment an entry to enable it.
# Each name maps to the binary "<root_path>/bin/<name>_test".
test_func = [
    #"all_gather",
    #"all_reduce",
    #"alltoall",
    #"alltoallv",
    "broadcast",
    #"reduce",
    #"reduce_scatter",
]

# First buffer size to test, in bytes (2 GiB).
test_mem_start = 2 * 1024 ** 3
# Last buffer size to test, in bytes (2 GiB).
test_mem_stop = 2 * 1024 ** 3
# Multiplicative growth factor between consecutive buffer sizes.
test_mem_factor = 2

# IDs of the NPU cards that take part in the test, as strings.
npu_list = [str(card_id) for card_id in range(8)]


## II. 进行HCCL测试

### 2.1 清理工作环境

In [None]:
# Rebuild the test binaries from a clean tree inside the project root.
os.chdir(root_path)
for build_cmd in ("make clean", "make"):
    os.system(build_cmd)
print(os.getcwd())

# Start from an empty log directory with one sub-directory for the raw test
# output ("data") and one for the merged profiling timelines ("prof").
log_root = f"{root_path}/log"
os.system(f"rm -rf {log_root}")
for sub_dir in ("data", "prof"):
    os.makedirs(f"{log_root}/{sub_dir}")

In [None]:
# Build the list of [test name, buffer size] pairs to run: every enabled test
# for every size from test_mem_start up to test_mem_stop (inclusive), growing
# the size by test_mem_factor each step.
if test_mem_start <= 0 or test_mem_factor <= 1:
    # Either condition would keep mem_size from ever exceeding test_mem_stop.
    raise ValueError(
        "test_mem_start must be positive and test_mem_factor must be greater than 1"
    )
mem_list = []
mem_size = test_mem_start
while mem_size <= test_mem_stop:
    for func in test_func:
        mem_list.append([func, mem_size])
    # Use the configured growth factor rather than a hard-coded 2.
    mem_size = mem_size * test_mem_factor
mem_count = len(mem_list)
print(f"mem_list len: {mem_count}")

### 2.2 执行测试脚本

```text
-b,--minbytes <min size in bytes>
-e,--maxbytes <max size in bytes>
-i,--stepbytes <increment size>
-f,--stepfactor <increment factor>
-n,--iters <iteration count>
-o,--op <sum/prod/min/max>
-d,--datatype <int8/int16/int32/fp16/fp32/int64/uint64/uint8/uint16/uint32/fp64/bfp16>
-r,--root <root>
-w,--warmup_iters <warmup iteration count>
-c,--check <result verification> 0:disabled 1:enabled.
-p,--npus <npus used for one node>
-h,--help
```

In [None]:
def get_size(size):
    """Format a byte count as a human-readable string with 1024-based units.

    Args:
        size: Number of bytes (int or float).

    Returns:
        The value followed by one of 'B', 'kB', 'MB', 'GB', 'TB'. Values
        below 1024 are returned unchanged (e.g. ``"512B"``); scaled values
        are floats (e.g. ``"2.0GB"``). Values of 1024 TB or more stay in TB.
    """
    units = ['B', 'kB', 'MB', 'GB', 'TB']
    unit_index = 0
    # ">=" so exact powers of 1024 move up a unit (1024 -> "1.0kB"); the index
    # bound keeps very large values from running past the last unit.
    while size >= 1024 and unit_index < len(units) - 1:
        size = size / 1024.00
        unit_index += 1
    return f'{size}{units[unit_index]}'

def get_script(func, mem_size):
    """Write a one-line shell script that launches one HCCL test via mpirun.

    The script runs ``<root_path>/bin/<func>_test`` on ``len(npu_list)`` ranks
    with the minimum and maximum buffer size both set to ``mem_size``, and
    redirects its output to ``<root_path>/log/data/<func>_all_npu.log``.

    Args:
        func: Name of the collective test (prefix of the ``*_test`` binary).
        mem_size: Buffer size in bytes passed as both ``-b`` and ``-e``.

    Returns:
        Path of the generated script WITHOUT the ``.sh`` suffix.
    """
    run_file = f"{root_path}/log/temp_run"
    script_path = f"{run_file}.sh"
    npus = len(npu_list)
    cmd = (
        f"mpirun -n {npus} {root_path}/bin/{func}_test"
        f" -b {mem_size} -e {mem_size} -p {npus}"
        f" > {root_path}/log/data/{func}_all_npu.log"
    )
    with open(script_path, 'w') as file:
        file.write(cmd)
    # Owner-only read/write/execute so the script can be launched directly.
    os.chmod(script_path, stat.S_IRWXU)
    return run_file

def get_time(start_time):
    """Return the wall-clock time elapsed since ``start_time`` as text.

    Args:
        start_time: Start timestamp in seconds, as returned by ``time.time()``.

    Returns:
        ``"HH:MM:SS"``, or ``"D day HH:MM:SS"`` once at least one full day has
        elapsed. Fractions of a second are truncated; a start time in the
        future is reported as ``"00:00:00"``.
    """
    elapsed = max(int(time.time() - start_time), 0)
    # divmod carries exact multiples correctly (60 s -> 00:01:00), unlike
    # repeated subtraction guarded by a strict "greater than" comparison.
    minute, second = divmod(elapsed, 60)
    hour, minute = divmod(minute, 60)
    day, hour = divmod(hour, 24)
    if day > 0:
        return f"{day} day {hour:02d}:{minute:02d}:{second:02d}"
    return f"{hour:02d}:{minute:02d}:{second:02d}"

def merge_json(src_path, dst_file):
    """Merge the timeline JSON of every profile directory into one file.

    For each sub-directory of ``src_path`` the function looks for an entry
    named ``device_<N>`` to identify the device, and picks the last file (in
    sorted name order) from that sub-directory's ``timeline`` folder. The
    ``process_name`` and ``thread_name`` records of each timeline get the
    device name prefixed to ``args.name`` so that records from different
    devices stay distinguishable after merging.

    Args:
        src_path: Directory containing one sub-directory per profile run.
        dst_file: Path of the merged JSON file to write.

    Returns:
        Dict mapping device name to the timeline file used for it. A profile
        directory without a ``device_<N>`` entry is stored under ``"null"``,
        so several such directories overwrite each other.

    Raises:
        OSError: If a sub-directory has no ``timeline`` folder.
        IndexError: If a ``timeline`` folder is empty.
    """
    device_pattern = re.compile(r"device_\d+")
    device_list = {}
    # os.listdir returns entries in arbitrary order; sort so both the chosen
    # timeline file and the order of the merged records are reproducible.
    for profile_dir in sorted(os.listdir(src_path)):
        profile_path = f"{src_path}/{profile_dir}"
        cur_device = "null"
        for entry in sorted(os.listdir(profile_path)):
            if device_pattern.match(entry):
                cur_device = entry
                break
        timeline = sorted(os.listdir(f"{profile_path}/timeline"))[-1]
        device_list[cur_device] = f"{profile_path}/timeline/{timeline}"

    merge_timeline = []
    for device, timeline_path in device_list.items():
        # Read-only access is enough; the source timelines are never modified.
        with open(timeline_path, 'r') as file:
            nodes = json.load(file)
        for node in nodes:
            is_name_record = (
                "name" in node
                and node["name"] in ("process_name", "thread_name")
            )
            if is_name_record and "args" in node and "name" in node["args"]:
                value = node["args"]["name"]
                node["args"]["name"] = f"{device} {value}"
            merge_timeline.append(node)

    with open(dst_file, 'w') as file:
        json.dump(merge_timeline, file)
    return device_list


In [None]:
os.environ['HCCL_TEST_USE_DEVS'] = ",".join(npu_list)
start_time = time.time()
for i in range(len(mem_list)):
    [func, mem_size] = mem_list[i]
    prt_precent = f"{((i + 1.0) / mem_count * 100):02.2f}%"
    print(f'{i + 1:03d} / {len(mem_list):03d} ({prt_precent}) >> {func} in {get_size(mem_size)}...', end='')
    run_file = get_script(func, mem_size)
    log_path = f"{root_path}/log"
    os.system(f"rm -rf {log_path}/tmp")
    cmd = f"""msprof --application="{run_file}.sh" \
        --output="{log_path}/tmp" \
        --host-sys=cpu,mem,network \
        --sys-hardware-mem=on \
        --sys-cpu-profiling=on \
        --sys-profiling=on > {run_file}.log"""
    os.system(cmd)
    dst_path = f"{log_path}/prof/{func}_{get_size(mem_size)}"
    device_list = merge_json(f"{log_path}/tmp", f"{dst_path}.json")
    # if not os.path.exists(dst_path):
    #     os.makedirs(dst_path)
    # for device in device_list:
    #     os.system(f"cp {device_list[device]} {dst_path}/{device}.json")
    # os.system(f"rm -rf {log_path}/tmp")
    os.system(f"rm {run_file}.sh")
    os.system(f"rm {run_file}.log")
    print(f' {get_time(start_time)} done')
    break