@@ -913,7 +913,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
913
913
914
914
for (int rank = 0 ; rank < gpu_count; rank++) {
915
915
if (SetGpu (rank)) {
916
- fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , cuda_err, rank );
916
+ fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , rank, cuda_err );
917
917
return -1 ;
918
918
}
919
919
@@ -924,7 +924,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
924
924
cuda_err = GpuMallocDataBuf (&(src_buffers_gpu[rank]), opts.size );
925
925
#endif
926
926
if (cuda_err != cudaSuccess) {
927
- fprintf (stderr, " RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n " , cuda_err, rank );
927
+ fprintf (stderr, " RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n " , rank, cuda_err );
928
928
return -1 ;
929
929
}
930
930
if (opts.check_data ) {
@@ -933,7 +933,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
933
933
}
934
934
cuda_err = cudaMemcpy (src_buffers_gpu[rank], data_buffer_cpu, opts.size , cudaMemcpyDefault);
935
935
if (cuda_err != cudaSuccess) {
936
- fprintf (stderr, " RunAllToAllBench::cudaMemcpy to src_buffers_gpu[%d] error: %d\n " , cuda_err, rank );
936
+ fprintf (stderr, " RunAllToAllBench::cudaMemcpy to src_buffers_gpu[%d] error: %d\n " , rank, cuda_err );
937
937
return -1 ;
938
938
}
939
939
}
@@ -945,7 +945,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
945
945
cuda_err = GpuMallocDataBuf (&(dst_buffers_gpu[rank]), opts.size );
946
946
#endif
947
947
if (cuda_err != cudaSuccess) {
948
- fprintf (stderr, " RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n " , cuda_err, rank );
948
+ fprintf (stderr, " RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n " , rank, cuda_err );
949
949
return -1 ;
950
950
}
951
951
@@ -959,33 +959,33 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
959
959
// Prepare events
960
960
cuda_err = cudaEventCreate (&(start_events[rank]));
961
961
if (cuda_err != cudaSuccess) {
962
- fprintf (stderr, " RunAllToAllBench::cudaEventCreate for start_events[%d] error: %d\n " , cuda_err, rank );
962
+ fprintf (stderr, " RunAllToAllBench::cudaEventCreate for start_events[%d] error: %d\n " , rank, cuda_err );
963
963
return -1 ;
964
964
}
965
965
cuda_err = cudaEventCreate (&(stop_events[rank]));
966
966
if (cuda_err != cudaSuccess) {
967
- fprintf (stderr, " RunAllToAllBench::cudaEventCreate for stop_events[%d] error: %d\n " , cuda_err, rank );
967
+ fprintf (stderr, " RunAllToAllBench::cudaEventCreate for stop_events[%d] error: %d\n " , rank, cuda_err );
968
968
return -1 ;
969
969
}
970
970
}
971
971
972
972
// Prepare kernel arguments
973
973
for (int rank = 0 ; rank < gpu_count; rank++) {
974
974
if (SetGpu (rank)) {
975
- fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , cuda_err, rank );
975
+ fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , rank, cuda_err );
976
976
return -1 ;
977
977
}
978
978
979
979
// Prepare destination buffer args
980
980
cuda_err = cudaMalloc (&(dst_buffer_gpu_args[rank]), sizeof (uint8_t *) * gpu_count);
981
981
if (cuda_err != cudaSuccess) {
982
- fprintf (stderr, " RunAllToAllBench::cudaMalloc for dst_buffer_gpu_args[%d] error: %d\n " , cuda_err, rank );
982
+ fprintf (stderr, " RunAllToAllBench::cudaMalloc for dst_buffer_gpu_args[%d] error: %d\n " , rank, cuda_err );
983
983
return -1 ;
984
984
}
985
985
cuda_err = cudaMemcpy (dst_buffer_gpu_args[rank], dst_buffers_gpu.data (), sizeof (uint8_t *) * gpu_count,
986
986
cudaMemcpyDefault);
987
987
if (cuda_err != cudaSuccess) {
988
- fprintf (stderr, " RunAllToAllBench::cudaMemcpy to dst_buffer_gpu_args[%d] error: %d\n " , cuda_err, rank );
988
+ fprintf (stderr, " RunAllToAllBench::cudaMemcpy to dst_buffer_gpu_args[%d] error: %d\n " , rank, cuda_err );
989
989
return -1 ;
990
990
}
991
991
}
@@ -998,15 +998,15 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
998
998
}
999
999
1000
1000
if (SetGpu (rank)) {
1001
- fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , cuda_err, rank );
1001
+ fprintf (stderr, " RunAllToAllBench::SetGpu for rank %d error: %d\n " , rank, cuda_err );
1002
1002
return -1 ;
1003
1003
}
1004
1004
1005
1005
if (i == opts.num_warm_up ) {
1006
1006
cuda_err = cudaEventRecord (start_events[rank], streams[rank]);
1007
1007
if (cuda_err != cudaSuccess) {
1008
- fprintf (stderr, " RunAllToAllBench::cudaEventRecord for start_events[%d] error: %d\n " , cuda_err ,
1009
- rank );
1008
+ fprintf (stderr, " RunAllToAllBench::cudaEventRecord for start_events[%d] error: %d\n " , rank ,
1009
+ cuda_err );
1010
1010
return -1 ;
1011
1011
}
1012
1012
}
@@ -1017,8 +1017,8 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
1017
1017
if (i == opts.num_warm_up + opts.num_loops - 1 ) {
1018
1018
cuda_err = cudaEventRecord (stop_events[rank], streams[rank]);
1019
1019
if (cuda_err != cudaSuccess) {
1020
- fprintf (stderr, " RunAllToAllBench::cudaEventRecord for stop_events[%d] error: %d\n " , cuda_err ,
1021
- rank );
1020
+ fprintf (stderr, " RunAllToAllBench::cudaEventRecord for stop_events[%d] error: %d\n " , rank ,
1021
+ cuda_err );
1022
1022
return -1 ;
1023
1023
}
1024
1024
}
@@ -1030,7 +1030,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
1030
1030
}
1031
1031
cuda_err = cudaStreamSynchronize (streams[rank]);
1032
1032
if (cuda_err != cudaSuccess) {
1033
- fprintf (stderr, " RunAllToAllBench::cudaStreamSynchronize streams[%d] error: %d\n " , cuda_err, rank );
1033
+ fprintf (stderr, " RunAllToAllBench::cudaStreamSynchronize streams[%d] error: %d\n " , rank, cuda_err );
1034
1034
return -1 ;
1035
1035
}
1036
1036
}
@@ -1045,7 +1045,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
1045
1045
float time_in_ms = 0 ;
1046
1046
cuda_err = cudaEventElapsedTime (&time_in_ms, start_events[rank], stop_events[rank]);
1047
1047
if (cuda_err != cudaSuccess) {
1048
- fprintf (stderr, " RunAllToAllBench::cudaEventElapsedTime for rank %d error: %d\n " , cuda_err, rank );
1048
+ fprintf (stderr, " RunAllToAllBench::cudaEventElapsedTime for rank %d error: %d\n " , rank, cuda_err );
1049
1049
return -1 ;
1050
1050
}
1051
1051
double bw = opts.size * (gpu_count - 1 ) * opts.num_loops / gpu_count / time_in_ms / 1e6 ;
@@ -1072,8 +1072,8 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
1072
1072
}
1073
1073
cuda_err = cudaMemcpy (data_buffer_cpu, dst_buffers_gpu[curr_dst_rank], opts.size , cudaMemcpyDefault);
1074
1074
if (cuda_err != cudaSuccess) {
1075
- fprintf (stderr, " RunAllToAllBench::cudaMemcpy from dst_buffers_gpu[%d] error: %d\n " , cuda_err ,
1076
- curr_dst_rank );
1075
+ fprintf (stderr, " RunAllToAllBench::cudaMemcpy from dst_buffers_gpu[%d] error: %d\n " , curr_dst_rank ,
1076
+ cuda_err );
1077
1077
return -1 ;
1078
1078
}
1079
1079
for (uint64_t i = 0 ; i < opts.size / sizeof (uint64_t ); i++) {
@@ -1100,17 +1100,17 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
1100
1100
for (int rank = 0 ; rank < gpu_count; rank++) {
1101
1101
cuda_err = cudaFree (src_buffers_gpu[rank]);
1102
1102
if (cuda_err != cudaSuccess) {
1103
- fprintf (stderr, " RunAllToAllBench::cudaFree for src_buffers_gpu[%d] error: %d\n " , cuda_err, rank );
1103
+ fprintf (stderr, " RunAllToAllBench::cudaFree for src_buffers_gpu[%d] error: %d\n " , rank, cuda_err );
1104
1104
return -1 ;
1105
1105
}
1106
1106
cuda_err = cudaFree (dst_buffers_gpu[rank]);
1107
1107
if (cuda_err != cudaSuccess) {
1108
- fprintf (stderr, " RunAllToAllBench::cudaFree for dst_buffers_gpu[%d] error: %d\n " , cuda_err, rank );
1108
+ fprintf (stderr, " RunAllToAllBench::cudaFree for dst_buffers_gpu[%d] error: %d\n " , rank, cuda_err );
1109
1109
return -1 ;
1110
1110
}
1111
1111
cuda_err = cudaFree (dst_buffer_gpu_args[rank]);
1112
1112
if (cuda_err != cudaSuccess) {
1113
- fprintf (stderr, " RunAllToAllBench::cudaFree for dst_buffer_gpu_args[%d] error: %d\n " , cuda_err, rank );
1113
+ fprintf (stderr, " RunAllToAllBench::cudaFree for dst_buffer_gpu_args[%d] error: %d\n " , rank, cuda_err );
1114
1114
return -1 ;
1115
1115
}
1116
1116
}
0 commit comments