diff --git a/README.md b/README.md index a8977a1d..cc5f75d5 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ | ✔️ [hgemm_t_8x8_sliced_k_f16x4_pack](./hgemm/hgemm.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️| | ✔️ [hgemm_t_8x8_sliced_k_f16x8_pack](./hgemm/hgemm.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️| | ✔️ [hgemm_t_8x8_sliced_k_..._bcf](./hgemm/hgemm.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️| +| ✔️ [hgemm_t_8x8_sliced_k_..._dbuf](./hgemm/hgemm.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️| | ✔️ [sgemv_k32_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️| | ✔️ [sgemv_k128_f32x4](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️| | ✔️ [sgemv_k16_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️| diff --git a/hgemm/README.md b/hgemm/README.md index 61b94fbb..b0517ffb 100755 --- a/hgemm/README.md +++ b/hgemm/README.md @@ -11,6 +11,7 @@ - [X] hgemm_t_8x8_sliced_k_f16x4_pack_bcf_kernel(bank conflicts reduce, pack) - [X] hgemm_t_4x4_sliced_k_f16x4_pack_bcf_kernel(bank conflicts reduce, pack) - [X] hgemm_t_8x8_sliced_k_f16x8_pack_bcf_kernel(bank conflicts reduce, pack) +- [X] hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf_kernel(bank conflicts reduce, pack, double buffers) - [X] PyTorch bindings ## 共享内存 Bank Conflicts @@ -47,407 +48,434 @@ python3 hgemm.py ```bash -------------------------------------------------------------------------------------------------------------- M=1024, N=1024, K=256 - out_f16: ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.162156ms - out_f16(sk): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.121964ms - out_f16x4pack(t4x4bcf): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.021611ms - out_f16x4pack(t4x4offset): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.021212ms - out_f16x4(t8x8sk): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.028727ms - out_f16x4(t8x8bcf): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.025299ms - out_f16x4pack(t8x8sk): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.026134ms - 
out_f16x4pack(bcf): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.023820ms - out_f16x4pack(offset): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.023861ms - out_f16x8pack(bcf): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.023755ms - out_f16x8pack(offset): ['21.484375 ', '-1.95996094 ', '-3.66210938 '], time:0.023767ms - out_f16_th: ['21.453125 ', '-2.02734375 ', '-3.62304688 '], time:0.010352ms + out_f16: ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.162152ms + out_f16(sk): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.121955ms + out_f16x4pack(t4x4bcf): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.021595ms + out_f16x4pack(t4x4offset): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.021180ms + out_f16x4(t8x8sk): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.028679ms + out_f16x4(t8x8bcf): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.025246ms + out_f16x4pack(t8x8sk): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.026116ms + out_f16x4pack(bcf): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.023810ms + out_f16x4pack(bcf+offset): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.023824ms + out_f16x8pack(bcf): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.023782ms + out_f16x8pack(bcf+offset): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.023645ms + out_f16x8pack(dbuf): ['-23.90625 ', '29.125 ', '12.6484375 '], time:0.019011ms + out_f16_th: ['-23.828125 ', '29.046875 ', '12.6484375 '], time:0.010310ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=1024, K=512 - out_f16: ['12.0546875 ', '25.71875 ', '46.0 '], time:0.319458ms - out_f16(sk): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.238683ms - out_f16x4pack(t4x4bcf): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.041040ms - out_f16x4pack(t4x4offset): ['12.0546875 ', '25.71875 ', '46.0 
'], time:0.040317ms - out_f16x4(t8x8sk): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.052266ms - out_f16x4(t8x8bcf): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.048014ms - out_f16x4pack(t8x8sk): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.048636ms - out_f16x4pack(bcf): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.045189ms - out_f16x4pack(offset): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.045024ms - out_f16x8pack(bcf): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.045140ms - out_f16x8pack(offset): ['12.0546875 ', '25.71875 ', '46.0 '], time:0.045192ms - out_f16_th: ['12.0703125 ', '25.734375 ', '45.75 '], time:0.016851ms + out_f16: ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.319457ms + out_f16(sk): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.238678ms + out_f16x4pack(t4x4bcf): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.041019ms + out_f16x4pack(t4x4offset): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.040299ms + out_f16x4(t8x8sk): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.052254ms + out_f16x4(t8x8bcf): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.047995ms + out_f16x4pack(t8x8sk): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.048609ms + out_f16x4pack(bcf): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.045190ms + out_f16x4pack(bcf+offset): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.044997ms + out_f16x8pack(bcf): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.045103ms + out_f16x8pack(bcf+offset): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.044857ms + out_f16x8pack(dbuf): ['-8.890625 ', '-32.46875 ', '7.9375 '], time:0.035505ms + out_f16_th: ['-8.8515625 ', '-32.5 ', '7.9140625 '], time:0.016819ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=1024, K=1024 - out_f16: ['62.8125 ', '-33.75 ', '36.90625 '], time:0.634046ms - out_f16(sk): 
['62.8125 ', '-33.75 ', '36.90625 '], time:0.471801ms - out_f16x4pack(t4x4bcf): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.079749ms - out_f16x4pack(t4x4offset): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.078303ms - out_f16x4(t8x8sk): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.099667ms - out_f16x4(t8x8bcf): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.093594ms - out_f16x4pack(t8x8sk): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.093852ms - out_f16x4pack(bcf): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.087847ms - out_f16x4pack(offset): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.087496ms - out_f16x8pack(bcf): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.087731ms - out_f16x8pack(offset): ['62.8125 ', '-33.75 ', '36.90625 '], time:0.087965ms - out_f16_th: ['62.65625 ', '-33.96875 ', '36.8125 '], time:0.029954ms + out_f16: ['53.28125 ', '27.75 ', '-18.296875 '], time:0.634142ms + out_f16(sk): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.471824ms + out_f16x4pack(t4x4bcf): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.079725ms + out_f16x4pack(t4x4offset): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.078288ms + out_f16x4(t8x8sk): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.099672ms + out_f16x4(t8x8bcf): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.093570ms + out_f16x4pack(t8x8sk): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.093840ms + out_f16x4pack(bcf): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.087830ms + out_f16x4pack(bcf+offset): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.087477ms + out_f16x8pack(bcf): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.087723ms + out_f16x8pack(bcf+offset): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.087284ms + out_f16x8pack(dbuf): ['53.28125 ', '27.75 ', '-18.296875 '], time:0.068520ms + out_f16_th: ['53.125 ', '27.421875 ', '-18.328125 '], time:0.029914ms -------------------------------------------------------------------------------------------------------------- 
-------------------------------------------------------------------------------------------------------------- M=1024, N=2048, K=256 - out_f16: ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.309604ms - out_f16(sk): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.232643ms - out_f16x4pack(t4x4bcf): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.037819ms - out_f16x4pack(t4x4offset): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.037167ms - out_f16x4(t8x8sk): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.045149ms - out_f16x4(t8x8bcf): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.041175ms - out_f16x4pack(t8x8sk): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.040584ms - out_f16x4pack(bcf): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.038208ms - out_f16x4pack(offset): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.038253ms - out_f16x8pack(bcf): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.036594ms - out_f16x8pack(offset): ['1.30273438 ', '-9.2109375 ', '-33.375 '], time:0.036381ms - out_f16_th: ['1.28808594 ', '-9.21875 ', '-33.5 '], time:0.018030ms + out_f16: ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.309584ms + out_f16(sk): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.232592ms + out_f16x4pack(t4x4bcf): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.037788ms + out_f16x4pack(t4x4offset): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.037134ms + out_f16x4(t8x8sk): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.045142ms + out_f16x4(t8x8bcf): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.041173ms + out_f16x4pack(t8x8sk): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.040542ms + out_f16x4pack(bcf): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.038180ms + out_f16x4pack(bcf+offset): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.038236ms + out_f16x8pack(bcf): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.036573ms + out_f16x8pack(bcf+offset): ['3.20507812 ', '15.2734375 ', 
'-8.65625 '], time:0.036463ms + out_f16x8pack(dbuf): ['3.20507812 ', '15.2734375 ', '-8.65625 '], time:0.034016ms + out_f16_th: ['3.20507812 ', '15.2734375 ', '-8.6640625 '], time:0.018011ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=2048, K=512 - out_f16: ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.611522ms - out_f16(sk): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.456609ms - out_f16x4pack(t4x4bcf): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.073241ms - out_f16x4pack(t4x4offset): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.071995ms - out_f16x4(t8x8sk): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.084352ms - out_f16x4(t8x8bcf): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.079296ms - out_f16x4pack(t8x8sk): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.077518ms - out_f16x4pack(bcf): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.073532ms - out_f16x4pack(offset): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.073690ms - out_f16x8pack(bcf): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.070404ms - out_f16x8pack(offset): ['-33.40625 ', '-26.4375 ', '-9.6953125 '], time:0.069838ms - out_f16_th: ['-33.46875 ', '-26.546875 ', '-9.671875 '], time:0.031267ms + out_f16: ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.611498ms + out_f16(sk): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.456554ms + out_f16x4pack(t4x4bcf): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.073230ms + out_f16x4pack(t4x4offset): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.071980ms + out_f16x4(t8x8sk): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.084323ms + out_f16x4(t8x8bcf): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.079292ms + out_f16x4pack(t8x8sk): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.077398ms + 
out_f16x4pack(bcf): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.073493ms + out_f16x4pack(bcf+offset): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.073687ms + out_f16x8pack(bcf): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.070355ms + out_f16x8pack(bcf+offset): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.070046ms + out_f16x8pack(dbuf): ['-15.375 ', '-8.1015625 ', '-7.7578125 '], time:0.065057ms + out_f16_th: ['-15.359375 ', '-8.2421875 ', '-7.828125 '], time:0.031242ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=2048, K=1024 - out_f16: ['-77.9375 ', '25.234375 ', '-32.25 '], time:1.215316ms - out_f16(sk): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.904359ms - out_f16x4pack(t4x4bcf): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.145146ms - out_f16x4pack(t4x4offset): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.142806ms - out_f16x4(t8x8sk): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.162499ms - out_f16x4(t8x8bcf): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.155442ms - out_f16x4pack(t8x8sk): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.151216ms - out_f16x4pack(bcf): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.144004ms - out_f16x4pack(offset): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.144379ms - out_f16x8pack(bcf): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.137974ms - out_f16x8pack(offset): ['-77.9375 ', '25.234375 ', '-32.25 '], time:0.136572ms - out_f16_th: ['-78.0 ', '25.09375 ', '-32.15625 '], time:0.057608ms + out_f16: ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:1.215085ms + out_f16(sk): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.904187ms + out_f16x4pack(t4x4bcf): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.145166ms + out_f16x4pack(t4x4offset): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.142814ms + 
out_f16x4(t8x8sk): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.162487ms + out_f16x4(t8x8bcf): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.155417ms + out_f16x4pack(t8x8sk): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.151247ms + out_f16x4pack(bcf): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.143991ms + out_f16x4pack(bcf+offset): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.144398ms + out_f16x8pack(bcf): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.138216ms + out_f16x8pack(bcf+offset): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.137445ms + out_f16x8pack(dbuf): ['3.85742188 ', '-2.390625 ', '-16.46875 '], time:0.127822ms + out_f16_th: ['3.88867188 ', '-2.41992188 ', '-16.484375 '], time:0.057582ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=4096, K=256 - out_f16: ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.606133ms - out_f16(sk): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.454601ms - out_f16x4pack(t4x4bcf): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.070374ms - out_f16x4pack(t4x4offset): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.069029ms - out_f16x4(t8x8sk): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.065957ms - out_f16x4(t8x8bcf): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.057919ms - out_f16x4pack(t8x8sk): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.058044ms - out_f16x4pack(bcf): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.053713ms - out_f16x4pack(offset): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.053627ms - out_f16x8pack(bcf): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.052321ms - out_f16x8pack(offset): ['-3.01171875 ', '-1.85058594 ', '-8.6796875 '], time:0.051835ms - out_f16_th: ['-3.00390625 ', 
'-1.84277344 ', '-8.6640625 '], time:0.025358ms + out_f16: ['13.5234375 ', '29.640625 ', '26.5 '], time:0.606235ms + out_f16(sk): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.454569ms + out_f16x4pack(t4x4bcf): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.070314ms + out_f16x4pack(t4x4offset): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.068989ms + out_f16x4(t8x8sk): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.065690ms + out_f16x4(t8x8bcf): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.057906ms + out_f16x4pack(t8x8sk): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.058073ms + out_f16x4pack(bcf): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.053701ms + out_f16x4pack(bcf+offset): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.053577ms + out_f16x8pack(bcf): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.052278ms + out_f16x8pack(bcf+offset): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.052042ms + out_f16x8pack(dbuf): ['13.5234375 ', '29.640625 ', '26.5 '], time:0.049119ms + out_f16_th: ['13.5625 ', '29.609375 ', '26.546875 '], time:0.025352ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=4096, K=512 - out_f16: ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:1.198144ms - out_f16(sk): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.893668ms - out_f16x4pack(t4x4bcf): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.139338ms - out_f16x4pack(t4x4offset): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.136840ms - out_f16x4(t8x8sk): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.120734ms - out_f16x4(t8x8bcf): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.111685ms - out_f16x4pack(t8x8sk): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.110269ms - out_f16x4pack(bcf): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.103436ms - out_f16x4pack(offset): ['-9.0625 ', 
'8.828125 ', '-13.2265625 '], time:0.103198ms - out_f16x8pack(bcf): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.100174ms - out_f16x8pack(offset): ['-9.0625 ', '8.828125 ', '-13.2265625 '], time:0.099486ms - out_f16_th: ['-9.0078125 ', '8.78125 ', '-13.1875 '], time:0.044984ms + out_f16: ['16.515625 ', '0.93505859 ', '-17.1875 '], time:1.197983ms + out_f16(sk): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.893652ms + out_f16x4pack(t4x4bcf): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.139344ms + out_f16x4pack(t4x4offset): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.136861ms + out_f16x4(t8x8sk): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.120757ms + out_f16x4(t8x8bcf): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.111687ms + out_f16x4pack(t8x8sk): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.110304ms + out_f16x4pack(bcf): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.103440ms + out_f16x4pack(bcf+offset): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.103147ms + out_f16x8pack(bcf): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.100182ms + out_f16x8pack(bcf+offset): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.099555ms + out_f16x8pack(dbuf): ['16.515625 ', '0.93505859 ', '-17.1875 '], time:0.094203ms + out_f16_th: ['16.484375 ', '0.81054688 ', '-17.140625 '], time:0.044953ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=1024, N=4096, K=1024 - out_f16: ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:2.382674ms - out_f16(sk): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:1.771703ms - out_f16x4pack(t4x4bcf): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.277116ms - out_f16x4pack(t4x4offset): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.273381ms - out_f16x4(t8x8sk): ['-29.015625 ', '5.1484375 ', '14.8828125 '], 
time:0.231652ms - out_f16x4(t8x8bcf): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.219471ms - out_f16x4pack(t8x8sk): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.215337ms - out_f16x4pack(bcf): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.202957ms - out_f16x4pack(offset): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.202289ms - out_f16x8pack(bcf): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.196815ms - out_f16x8pack(offset): ['-29.015625 ', '5.1484375 ', '14.8828125 '], time:0.195075ms - out_f16_th: ['-29.0625 ', '5.1015625 ', '14.984375 '], time:0.084119ms + out_f16: ['27.5625 ', '24.140625 ', '-13.1796875 '], time:2.382421ms + out_f16(sk): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:1.771888ms + out_f16x4pack(t4x4bcf): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.277140ms + out_f16x4pack(t4x4offset): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.273262ms + out_f16x4(t8x8sk): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.230666ms + out_f16x4(t8x8bcf): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.219387ms + out_f16x4pack(t8x8sk): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.215188ms + out_f16x4pack(bcf): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.202930ms + out_f16x4pack(bcf+offset): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.202336ms + out_f16x8pack(bcf): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.196809ms + out_f16x8pack(bcf+offset): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.195358ms + out_f16x8pack(dbuf): ['27.5625 ', '24.140625 ', '-13.1796875 '], time:0.184610ms + out_f16_th: ['27.78125 ', '24.15625 ', '-13.3359375 '], time:0.084101ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=1024, K=256 - out_f16: ['22.59375 ', '25.15625 ', '20.390625 '], time:0.309613ms - out_f16(sk): 
['22.59375 ', '25.15625 ', '20.390625 '], time:0.232681ms - out_f16x4pack(t4x4bcf): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.037872ms - out_f16x4pack(t4x4offset): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.037215ms - out_f16x4(t8x8sk): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.044645ms - out_f16x4(t8x8bcf): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.041085ms - out_f16x4pack(t8x8sk): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.040396ms - out_f16x4pack(bcf): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.038233ms - out_f16x4pack(offset): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.038226ms - out_f16x8pack(bcf): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.036567ms - out_f16x8pack(offset): ['22.59375 ', '25.15625 ', '20.390625 '], time:0.036368ms - out_f16_th: ['22.59375 ', '25.140625 ', '20.40625 '], time:0.018016ms + out_f16: ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.309631ms + out_f16(sk): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.232697ms + out_f16x4pack(t4x4bcf): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.037850ms + out_f16x4pack(t4x4offset): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.037194ms + out_f16x4(t8x8sk): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.044630ms + out_f16x4(t8x8bcf): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.041074ms + out_f16x4pack(t8x8sk): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.040302ms + out_f16x4pack(bcf): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.038220ms + out_f16x4pack(bcf+offset): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.038198ms + out_f16x8pack(bcf): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.036558ms + out_f16x8pack(bcf+offset): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.036434ms + out_f16x8pack(dbuf): ['31.53125 ', '9.625 ', '-2.05859375 '], time:0.034007ms + out_f16_th: ['31.515625 ', '9.65625 ', '-2.06445312 '], time:0.017991ms 
-------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=1024, K=512 - out_f16: ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.611553ms - out_f16(sk): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.456971ms - out_f16x4pack(t4x4bcf): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.073360ms - out_f16x4pack(t4x4offset): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.071994ms - out_f16x4(t8x8sk): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.083932ms - out_f16x4(t8x8bcf): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.079230ms - out_f16x4pack(t8x8sk): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.077116ms - out_f16x4pack(bcf): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.073698ms - out_f16x4pack(offset): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.073669ms - out_f16x8pack(bcf): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.070330ms - out_f16x8pack(offset): ['-20.421875 ', '3.49023438 ', '32.1875 '], time:0.069845ms - out_f16_th: ['-20.4375 ', '3.50976562 ', '32.1875 '], time:0.031190ms + out_f16: ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.611916ms + out_f16(sk): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.456705ms + out_f16x4pack(t4x4bcf): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.073276ms + out_f16x4pack(t4x4offset): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.071982ms + out_f16x4(t8x8sk): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.083909ms + out_f16x4(t8x8bcf): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.079222ms + out_f16x4pack(t8x8sk): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.077219ms + out_f16x4pack(bcf): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.073688ms + out_f16x4pack(bcf+offset): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.073673ms + out_f16x8pack(bcf): ['-8.390625 ', '38.5625 ', 
'2.90234375 '], time:0.070326ms + out_f16x8pack(bcf+offset): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.070006ms + out_f16x8pack(dbuf): ['-8.390625 ', '38.5625 ', '2.90234375 '], time:0.065054ms + out_f16_th: ['-8.4140625 ', '38.40625 ', '2.91601562 '], time:0.031183ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=1024, K=1024 - out_f16: ['-11.984375 ', '13.859375 ', '3.01367188 '], time:1.215566ms - out_f16(sk): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.904491ms - out_f16x4pack(t4x4bcf): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.145355ms - out_f16x4pack(t4x4offset): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.142924ms - out_f16x4(t8x8sk): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.162164ms - out_f16x4(t8x8bcf): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.155778ms - out_f16x4pack(t8x8sk): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.150889ms - out_f16x4pack(bcf): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.144444ms - out_f16x4pack(offset): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.144348ms - out_f16x8pack(bcf): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.138090ms - out_f16x8pack(offset): ['-11.984375 ', '13.859375 ', '3.01367188 '], time:0.136937ms - out_f16_th: ['-11.9296875 ', '13.9140625 ', '3.02148438 '], time:0.057563ms + out_f16: ['7.14453125 ', '-34.46875 ', '33.28125 '], time:1.215496ms + out_f16(sk): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.904354ms + out_f16x4pack(t4x4bcf): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.145215ms + out_f16x4pack(t4x4offset): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.142813ms + out_f16x4(t8x8sk): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.162309ms + out_f16x4(t8x8bcf): ['7.14453125 ', '-34.46875 ', '33.28125 '], 
time:0.155714ms + out_f16x4pack(t8x8sk): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.150831ms + out_f16x4pack(bcf): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.144459ms + out_f16x4pack(bcf+offset): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.144314ms + out_f16x8pack(bcf): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.137980ms + out_f16x8pack(bcf+offset): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.137455ms + out_f16x8pack(dbuf): ['7.14453125 ', '-34.46875 ', '33.28125 '], time:0.127817ms + out_f16_th: ['6.9609375 ', '-34.15625 ', '33.5 '], time:0.057532ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=2048, K=256 - out_f16: ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.606369ms - out_f16(sk): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.455201ms - out_f16x4pack(t4x4bcf): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.070472ms - out_f16x4pack(t4x4offset): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.069127ms - out_f16x4(t8x8sk): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.067195ms - out_f16x4(t8x8bcf): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.057917ms - out_f16x4pack(t8x8sk): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.058713ms - out_f16x4pack(bcf): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.053779ms - out_f16x4pack(offset): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.053728ms - out_f16x8pack(bcf): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.052478ms - out_f16x8pack(offset): ['8.8203125 ', '-3.16601562 ', '13.7265625 '], time:0.052156ms - out_f16_th: ['8.8359375 ', '-3.15820312 ', '13.71875 '], time:0.025408ms + out_f16: ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.606350ms + out_f16(sk): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.455183ms + 
out_f16x4pack(t4x4bcf): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.070434ms + out_f16x4pack(t4x4offset): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.069097ms + out_f16x4(t8x8sk): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.067122ms + out_f16x4(t8x8bcf): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.057893ms + out_f16x4pack(t8x8sk): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.058693ms + out_f16x4pack(bcf): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.053744ms + out_f16x4pack(bcf+offset): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.053674ms + out_f16x8pack(bcf): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.052449ms + out_f16x8pack(bcf+offset): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.052198ms + out_f16x8pack(dbuf): ['1.84375 ', '-8.0234375 ', '-5.62890625 '], time:0.049208ms + out_f16_th: ['1.82617188 ', '-8.015625 ', '-5.65234375 '], time:0.025376ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=2048, K=512 - out_f16: ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:1.198213ms - out_f16(sk): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.894974ms - out_f16x4pack(t4x4bcf): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.139549ms - out_f16x4pack(t4x4offset): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.137022ms - out_f16x4(t8x8sk): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.122213ms - out_f16x4(t8x8bcf): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.111684ms - out_f16x4pack(t8x8sk): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.111181ms - out_f16x4pack(bcf): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.103350ms - out_f16x4pack(offset): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.103209ms - out_f16x8pack(bcf): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], 
time:0.100302ms - out_f16x8pack(offset): ['-12.1328125 ', '-3.8203125 ', '15.4375 '], time:0.100054ms - out_f16_th: ['-12.1328125 ', '-3.82421875 ', '15.46875 '], time:0.045010ms + out_f16: ['28.15625 ', '43.09375 ', '-8.765625 '], time:1.198318ms + out_f16(sk): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.894948ms + out_f16x4pack(t4x4bcf): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.139518ms + out_f16x4pack(t4x4offset): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.136982ms + out_f16x4(t8x8sk): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.122198ms + out_f16x4(t8x8bcf): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.111648ms + out_f16x4pack(t8x8sk): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.111161ms + out_f16x4pack(bcf): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.103313ms + out_f16x4pack(bcf+offset): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.103238ms + out_f16x8pack(bcf): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.100296ms + out_f16x8pack(bcf+offset): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.099789ms + out_f16x8pack(dbuf): ['28.15625 ', '43.09375 ', '-8.765625 '], time:0.094228ms + out_f16_th: ['28.328125 ', '43.1875 ', '-8.7734375 '], time:0.044975ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=2048, K=1024 - out_f16: ['22.96875 ', '47.6875 ', '34.40625 '], time:2.382358ms - out_f16(sk): ['22.96875 ', '47.6875 ', '34.40625 '], time:1.773636ms - out_f16x4pack(t4x4bcf): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.277883ms - out_f16x4pack(t4x4offset): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.273660ms - out_f16x4(t8x8sk): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.233389ms - out_f16x4(t8x8bcf): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.219998ms - out_f16x4pack(t8x8sk): ['22.96875 ', '47.6875 ', '34.40625 '], 
time:0.216918ms - out_f16x4pack(bcf): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.203171ms - out_f16x4pack(offset): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.202978ms - out_f16x8pack(bcf): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.197327ms - out_f16x8pack(offset): ['22.96875 ', '47.6875 ', '34.40625 '], time:0.196782ms - out_f16_th: ['23.0 ', '47.78125 ', '34.34375 '], time:0.084282ms + out_f16: ['31.953125 ', '-24.375 ', '1.33691406 '], time:2.382188ms + out_f16(sk): ['31.953125 ', '-24.375 ', '1.33691406 '], time:1.773512ms + out_f16x4pack(t4x4bcf): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.277877ms + out_f16x4pack(t4x4offset): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.273361ms + out_f16x4(t8x8sk): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.233111ms + out_f16x4(t8x8bcf): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.220381ms + out_f16x4pack(t8x8sk): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.216933ms + out_f16x4pack(bcf): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.203425ms + out_f16x4pack(bcf+offset): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.202874ms + out_f16x8pack(bcf): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.197368ms + out_f16x8pack(bcf+offset): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.196184ms + out_f16x8pack(dbuf): ['31.953125 ', '-24.375 ', '1.33691406 '], time:0.185119ms + out_f16_th: ['32.03125 ', '-24.46875 ', '1.41210938 '], time:0.084420ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=4096, K=256 - out_f16: ['0.83447266 ', '1.921875 ', '2.62890625 '], time:1.204888ms - out_f16(sk): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.903167ms - out_f16x4pack(t4x4bcf): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.130934ms - out_f16x4pack(t4x4offset): ['0.83447266 ', '1.921875 ', 
'2.62890625 '], time:0.128506ms - out_f16x4(t8x8sk): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.124031ms - out_f16x4(t8x8bcf): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.108457ms - out_f16x4pack(t8x8sk): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.108359ms - out_f16x4pack(bcf): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.099940ms - out_f16x4pack(offset): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.099585ms - out_f16x8pack(bcf): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.096716ms - out_f16x8pack(offset): ['0.83447266 ', '1.921875 ', '2.62890625 '], time:0.095844ms - out_f16_th: ['0.82666016 ', '1.90039062 ', '2.66015625 '], time:0.046674ms + out_f16: ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:1.205165ms + out_f16(sk): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.903791ms + out_f16x4pack(t4x4bcf): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.131277ms + out_f16x4pack(t4x4offset): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.128841ms + out_f16x4(t8x8sk): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.124274ms + out_f16x4(t8x8bcf): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.108666ms + out_f16x4pack(t8x8sk): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.108618ms + out_f16x4pack(bcf): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.100218ms + out_f16x4pack(bcf+offset): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.099845ms + out_f16x8pack(bcf): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.097036ms + out_f16x8pack(bcf+offset): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.096186ms + out_f16x8pack(dbuf): ['23.046875 ', '6.28515625 ', '-0.76660156 '], time:0.093224ms + out_f16_th: ['23.03125 ', '6.2890625 ', '-0.78417969 '], time:0.046790ms -------------------------------------------------------------------------------------------------------------- 
-------------------------------------------------------------------------------------------------------------- M=2048, N=4096, K=512 - out_f16: ['-38.25 ', '8.1484375 ', '35.84375 '], time:2.383915ms - out_f16(sk): ['-38.25 ', '8.1484375 ', '35.84375 '], time:1.776247ms - out_f16x4pack(t4x4bcf): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.259576ms - out_f16x4pack(t4x4offset): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.255759ms - out_f16x4(t8x8sk): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.231398ms - out_f16x4(t8x8bcf): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.211641ms - out_f16x4pack(t8x8sk): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.208231ms - out_f16x4pack(bcf): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.195185ms - out_f16x4pack(offset): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.194489ms - out_f16x8pack(bcf): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.188848ms - out_f16x8pack(offset): ['-38.25 ', '8.1484375 ', '35.84375 '], time:0.187315ms - out_f16_th: ['-38.5 ', '8.109375 ', '35.96875 '], time:0.089850ms + out_f16: ['21.1875 ', '16.765625 ', '-17.984375 '], time:2.384285ms + out_f16(sk): ['21.1875 ', '16.765625 ', '-17.984375 '], time:1.776543ms + out_f16x4pack(t4x4bcf): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.260412ms + out_f16x4pack(t4x4offset): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.256168ms + out_f16x4(t8x8sk): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.231487ms + out_f16x4(t8x8bcf): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.211654ms + out_f16x4pack(t8x8sk): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.208195ms + out_f16x4pack(bcf): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.195177ms + out_f16x4pack(bcf+offset): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.194470ms + out_f16x8pack(bcf): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.188873ms + out_f16x8pack(bcf+offset): ['21.1875 ', '16.765625 ', '-17.984375 '], time:0.187346ms + out_f16x8pack(dbuf): ['21.1875 ', 
'16.765625 ', '-17.984375 '], time:0.181744ms + out_f16_th: ['21.1875 ', '16.703125 ', '-17.96875 '], time:0.090159ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=2048, N=4096, K=1024 - out_f16: ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:4.740417ms - out_f16(sk): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:3.523793ms - out_f16x4pack(t4x4bcf): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.525656ms - out_f16x4pack(t4x4offset): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.514398ms - out_f16x4(t8x8sk): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.444274ms - out_f16x4(t8x8bcf): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.418189ms - out_f16x4pack(t8x8sk): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.408354ms - out_f16x4pack(bcf): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.384445ms - out_f16x4pack(offset): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.383688ms - out_f16x8pack(bcf): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.372740ms - out_f16x8pack(offset): ['-46.0625 ', '-31.671875 ', '7.41015625 '], time:0.371087ms - out_f16_th: ['-46.0625 ', '-31.84375 ', '7.41015625 '], time:0.168835ms + out_f16: ['25.28125 ', '-35.8125 ', '-34.90625 '], time:4.740940ms + out_f16(sk): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:3.523728ms + out_f16x4pack(t4x4bcf): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.526218ms + out_f16x4pack(t4x4offset): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.514424ms + out_f16x4(t8x8sk): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.444150ms + out_f16x4(t8x8bcf): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.418200ms + out_f16x4pack(t8x8sk): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.407552ms + out_f16x4pack(bcf): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.384386ms + 
out_f16x4pack(bcf+offset): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.383409ms + out_f16x8pack(bcf): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.372210ms + out_f16x8pack(bcf+offset): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.370581ms + out_f16x8pack(dbuf): ['25.28125 ', '-35.8125 ', '-34.90625 '], time:0.359300ms + out_f16_th: ['25.21875 ', '-36.03125 ', '-34.84375 '], time:0.168989ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=1024, K=256 - out_f16: ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.606685ms - out_f16(sk): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.455339ms - out_f16x4pack(t4x4bcf): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.070524ms - out_f16x4pack(t4x4offset): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.069181ms - out_f16x4(t8x8sk): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.065643ms - out_f16x4(t8x8bcf): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.057961ms - out_f16x4pack(t8x8sk): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.058129ms - out_f16x4pack(bcf): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.053869ms - out_f16x4pack(offset): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.053774ms - out_f16x8pack(bcf): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.052531ms - out_f16x8pack(offset): ['11.09375 ', '-4.30078125 ', '28.75 '], time:0.052143ms - out_f16_th: ['11.09375 ', '-4.328125 ', '28.78125 '], time:0.025280ms + out_f16: ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.606902ms + out_f16(sk): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.456121ms + out_f16x4pack(t4x4bcf): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.070778ms + out_f16x4pack(t4x4offset): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.069436ms + out_f16x4(t8x8sk): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], 
time:0.065789ms + out_f16x4(t8x8bcf): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.058092ms + out_f16x4pack(t8x8sk): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.058286ms + out_f16x4pack(bcf): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.053996ms + out_f16x4pack(bcf+offset): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.053891ms + out_f16x8pack(bcf): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.052655ms + out_f16x8pack(bcf+offset): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.052357ms + out_f16x8pack(dbuf): ['5.2265625 ', '-14.3046875 ', '12.6953125 '], time:0.049304ms + out_f16_th: ['5.20703125 ', '-14.28125 ', '12.625 '], time:0.025346ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=1024, K=512 - out_f16: ['6.16796875 ', '23.28125 ', '63.8125 '], time:1.199290ms - out_f16(sk): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.894740ms - out_f16x4pack(t4x4bcf): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.139941ms - out_f16x4pack(t4x4offset): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.137491ms - out_f16x4(t8x8sk): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.120988ms - out_f16x4(t8x8bcf): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.111963ms - out_f16x4pack(t8x8sk): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.110834ms - out_f16x4pack(bcf): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.103897ms - out_f16x4pack(offset): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.103645ms - out_f16x8pack(bcf): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.100538ms - out_f16x8pack(offset): ['6.16796875 ', '23.28125 ', '63.8125 '], time:0.100353ms - out_f16_th: ['6.23046875 ', '23.265625 ', '63.40625 '], time:0.044990ms + out_f16: ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:1.199327ms + out_f16(sk): ['6.765625 
', '-1.97167969 ', '-23.640625 '], time:0.895021ms + out_f16x4pack(t4x4bcf): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.139593ms + out_f16x4pack(t4x4offset): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.137124ms + out_f16x4(t8x8sk): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.120680ms + out_f16x4(t8x8bcf): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.111681ms + out_f16x4pack(t8x8sk): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.110540ms + out_f16x4pack(bcf): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.103607ms + out_f16x4pack(bcf+offset): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.103339ms + out_f16x8pack(bcf): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.100309ms + out_f16x8pack(bcf+offset): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.099732ms + out_f16x8pack(dbuf): ['6.765625 ', '-1.97167969 ', '-23.640625 '], time:0.094197ms + out_f16_th: ['6.71875 ', '-1.98535156 ', '-23.703125 '], time:0.044829ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=1024, K=1024 - out_f16: ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:2.384896ms - out_f16(sk): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:1.774733ms - out_f16x4pack(t4x4bcf): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.277911ms - out_f16x4pack(t4x4offset): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.273759ms - out_f16x4(t8x8sk): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.231657ms - out_f16x4(t8x8bcf): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.219461ms - out_f16x4pack(t8x8sk): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.215819ms - out_f16x4pack(bcf): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.203251ms - out_f16x4pack(offset): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.202671ms - out_f16x8pack(bcf): ['-65.5 ', 
'-9.8515625 ', '-25.921875 '], time:0.196928ms - out_f16x8pack(offset): ['-65.5 ', '-9.8515625 ', '-25.921875 '], time:0.195895ms - out_f16_th: ['-65.625 ', '-9.9375 ', '-26.03125 '], time:0.084260ms + out_f16: ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:2.383269ms + out_f16(sk): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:1.773585ms + out_f16x4pack(t4x4bcf): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.277344ms + out_f16x4pack(t4x4offset): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.273213ms + out_f16x4(t8x8sk): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.231479ms + out_f16x4(t8x8bcf): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.219741ms + out_f16x4pack(t8x8sk): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.215539ms + out_f16x4pack(bcf): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.202891ms + out_f16x4pack(bcf+offset): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.203077ms + out_f16x8pack(bcf): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.197309ms + out_f16x8pack(bcf+offset): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.196089ms + out_f16x8pack(dbuf): ['-18.734375 ', '23.484375 ', '-15.2890625 '], time:0.185151ms + out_f16_th: ['-18.640625 ', '23.484375 ', '-15.1875 '], time:0.084677ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=2048, K=256 - out_f16: ['-20.84375 ', '8.875 ', '-3.8828125 '], time:1.205260ms - out_f16(sk): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.905329ms - out_f16x4pack(t4x4bcf): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.131099ms - out_f16x4pack(t4x4offset): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.128661ms - out_f16x4(t8x8sk): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.128078ms - out_f16x4(t8x8bcf): ['-20.84375 ', '8.875 ', '-3.8828125 '], 
time:0.108449ms - out_f16x4pack(t8x8sk): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.109743ms - out_f16x4pack(bcf): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.099809ms - out_f16x4pack(offset): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.099596ms - out_f16x8pack(bcf): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.096962ms - out_f16x8pack(offset): ['-20.84375 ', '8.875 ', '-3.8828125 '], time:0.095946ms - out_f16_th: ['-20.75 ', '8.875 ', '-3.875 '], time:0.046710ms + out_f16: ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:1.207559ms + out_f16(sk): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.904191ms + out_f16x4pack(t4x4bcf): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.131142ms + out_f16x4pack(t4x4offset): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.128715ms + out_f16x4(t8x8sk): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.128132ms + out_f16x4(t8x8bcf): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.108476ms + out_f16x4pack(t8x8sk): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.109819ms + out_f16x4pack(bcf): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.099860ms + out_f16x4pack(bcf+offset): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.099651ms + out_f16x8pack(bcf): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.097065ms + out_f16x8pack(bcf+offset): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.096196ms + out_f16x8pack(dbuf): ['0.18762207 ', '-16.328125 ', '-9.28125 '], time:0.093154ms + out_f16_th: ['0.20019531 ', '-16.359375 ', '-9.28125 '], time:0.046717ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=2048, K=512 - out_f16: ['26.390625 ', '27.8125 ', '-30.59375 '], time:2.384545ms - out_f16(sk): ['26.390625 ', '27.8125 ', '-30.59375 '], time:1.779222ms - out_f16x4pack(t4x4bcf): ['26.390625 ', 
'27.8125 ', '-30.59375 '], time:0.261164ms - out_f16x4pack(t4x4offset): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.256294ms - out_f16x4(t8x8sk): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.235556ms - out_f16x4(t8x8bcf): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.211694ms - out_f16x4pack(t8x8sk): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.209532ms - out_f16x4pack(bcf): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.194987ms - out_f16x4pack(offset): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.194519ms - out_f16x8pack(bcf): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.188973ms - out_f16x8pack(offset): ['26.390625 ', '27.8125 ', '-30.59375 '], time:0.187423ms - out_f16_th: ['26.40625 ', '27.640625 ', '-30.703125 '], time:0.086123ms + out_f16: ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:2.384526ms + out_f16(sk): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:1.778904ms + out_f16x4pack(t4x4bcf): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.259862ms + out_f16x4pack(t4x4offset): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.256946ms + out_f16x4(t8x8sk): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.235498ms + out_f16x4(t8x8bcf): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.211723ms + out_f16x4pack(t8x8sk): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.209528ms + out_f16x4pack(bcf): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.194957ms + out_f16x4pack(bcf+offset): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.194530ms + out_f16x8pack(bcf): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.189021ms + out_f16x8pack(bcf+offset): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.187663ms + out_f16x8pack(dbuf): ['-0.86279297 ', '-0.78173828 ', '-26.921875 '], time:0.181922ms + out_f16_th: ['-0.86572266 ', '-0.75537109 ', '-26.90625 '], time:0.085870ms 
-------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=2048, K=1024 - out_f16: ['38.0625 ', '11.765625 ', '9.7734375 '], time:4.739503ms - out_f16(sk): ['38.0625 ', '11.765625 ', '9.7734375 '], time:3.528736ms - out_f16x4pack(t4x4bcf): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.524106ms - out_f16x4pack(t4x4offset): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.517267ms - out_f16x4(t8x8sk): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.448143ms - out_f16x4(t8x8bcf): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.418208ms - out_f16x4pack(t8x8sk): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.410209ms - out_f16x4pack(bcf): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.384880ms - out_f16x4pack(offset): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.384080ms - out_f16x8pack(bcf): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.374737ms - out_f16x8pack(offset): ['38.0625 ', '11.765625 ', '9.7734375 '], time:0.371141ms - out_f16_th: ['38.15625 ', '11.7890625 ', '9.71875 '], time:0.168827ms + out_f16: ['-30.9375 ', '22.625 ', '-8.75 '], time:4.738972ms + out_f16(sk): ['-30.9375 ', '22.625 ', '-8.75 '], time:3.529367ms + out_f16x4pack(t4x4bcf): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.526890ms + out_f16x4pack(t4x4offset): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.515642ms + out_f16x4(t8x8sk): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.448207ms + out_f16x4(t8x8bcf): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.418634ms + out_f16x4pack(t8x8sk): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.409610ms + out_f16x4pack(bcf): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.385201ms + out_f16x4pack(bcf+offset): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.384448ms + out_f16x8pack(bcf): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.373107ms + out_f16x8pack(bcf+offset): ['-30.9375 ', '22.625 ', 
'-8.75 '], time:0.372995ms + out_f16x8pack(dbuf): ['-30.9375 ', '22.625 ', '-8.75 '], time:0.359854ms + out_f16_th: ['-30.953125 ', '22.734375 ', '-8.7578125 '], time:0.169054ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=4096, K=256 - out_f16: ['35.53125 ', '31.109375 ', '-0.60400391 '], time:2.400335ms - out_f16(sk): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:1.799110ms - out_f16x4pack(t4x4bcf): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.252537ms - out_f16x4pack(t4x4offset): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.247484ms - out_f16x4(t8x8sk): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.230805ms - out_f16x4(t8x8bcf): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.208215ms - out_f16x4pack(t8x8sk): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.206558ms - out_f16x4pack(bcf): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.191569ms - out_f16x4pack(offset): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.190972ms - out_f16x8pack(bcf): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.184975ms - out_f16x8pack(offset): ['35.53125 ', '31.109375 ', '-0.60400391 '], time:0.183313ms - out_f16_th: ['35.5 ', '31.0625 ', '-0.60791016 '], time:0.088493ms + out_f16: ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:2.403322ms + out_f16(sk): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:1.799120ms + out_f16x4pack(t4x4bcf): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.252224ms + out_f16x4pack(t4x4offset): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.250270ms + out_f16x4(t8x8sk): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.230155ms + out_f16x4(t8x8bcf): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.208173ms + out_f16x4pack(t8x8sk): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.206509ms + 
out_f16x4pack(bcf): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.191518ms + out_f16x4pack(bcf+offset): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.190922ms + out_f16x8pack(bcf): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.184900ms + out_f16x8pack(bcf+offset): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.183368ms + out_f16x8pack(dbuf): ['13.765625 ', '-1.97753906 ', '-14.953125 '], time:0.180867ms + out_f16_th: ['13.7421875 ', '-1.984375 ', '-14.9609375 '], time:0.088463ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=4096, K=512 - out_f16: ['20.671875 ', '33.28125 ', '-0.69433594 '], time:4.749888ms - out_f16(sk): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:3.542411ms - out_f16x4pack(t4x4bcf): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.513864ms - out_f16x4pack(t4x4offset): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.500157ms - out_f16x4(t8x8sk): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.441413ms - out_f16x4(t8x8bcf): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.410336ms - out_f16x4pack(t8x8sk): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.402255ms - out_f16x4pack(bcf): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.378996ms - out_f16x4pack(offset): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.376065ms - out_f16x8pack(bcf): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.366248ms - out_f16x8pack(offset): ['20.671875 ', '33.28125 ', '-0.69433594 '], time:0.364445ms - out_f16_th: ['20.5625 ', '33.21875 ', '-0.73828125 '], time:0.168390ms + out_f16: ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:4.749713ms + out_f16(sk): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:3.543025ms + out_f16x4pack(t4x4bcf): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.508850ms + 
out_f16x4pack(t4x4offset): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.505403ms + out_f16x4(t8x8sk): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.441533ms + out_f16x4(t8x8bcf): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.410424ms + out_f16x4pack(t8x8sk): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.403210ms + out_f16x4pack(bcf): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.376480ms + out_f16x4pack(bcf+offset): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.376418ms + out_f16x8pack(bcf): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.365713ms + out_f16x8pack(bcf+offset): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.362226ms + out_f16x8pack(dbuf): ['13.6796875 ', '-39.34375 ', '4.8046875 '], time:0.356183ms + out_f16_th: ['13.6953125 ', '-39.4375 ', '4.7734375 '], time:0.168072ms -------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------- M=4096, N=4096, K=1024 - out_f16: ['22.234375 ', '40.28125 ', '-24.578125 '], time:9.448379ms - out_f16(sk): ['22.234375 ', '40.28125 ', '-24.578125 '], time:7.028021ms - out_f16x4pack(t4x4bcf): ['22.234375 ', '40.28125 ', '-24.578125 '], time:1.032172ms - out_f16x4pack(t4x4offset): ['22.234375 ', '40.28125 ', '-24.578125 '], time:1.023668ms - out_f16x4(t8x8sk): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.857614ms - out_f16x4(t8x8bcf): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.817854ms - out_f16x4pack(t8x8sk): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.796847ms - out_f16x4pack(bcf): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.753374ms - out_f16x4pack(offset): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.750718ms - out_f16x8pack(bcf): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.732352ms - out_f16x8pack(offset): ['22.234375 ', '40.28125 ', '-24.578125 '], time:0.727910ms - 
out_f16_th: ['22.484375 ', '40.375 ', '-24.578125 '], time:0.333349ms + out_f16: ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:9.449276ms + out_f16(sk): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:7.027740ms + out_f16x4pack(t4x4bcf): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:1.033735ms + out_f16x4pack(t4x4offset): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:1.020987ms + out_f16x4(t8x8sk): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.857793ms + out_f16x4(t8x8bcf): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.814633ms + out_f16x4pack(t8x8sk): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.796247ms + out_f16x4pack(bcf): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.754777ms + out_f16x4pack(bcf+offset): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.754538ms + out_f16x8pack(bcf): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.731132ms + out_f16x8pack(bcf+offset): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.729123ms + out_f16x8pack(dbuf): ['-74.6875 ', '-5.484375 ', '1.04882812 '], time:0.720609ms + out_f16_th: ['-74.375 ', '-5.49609375 ', '1.1015625 '], time:0.334342ms -------------------------------------------------------------------------------------------------------------- ``` diff --git a/hgemm/hgemm.cu b/hgemm/hgemm.cu index a6b93b84..142f9aa6 100644 --- a/hgemm/hgemm.cu +++ b/hgemm/hgemm.cu @@ -694,6 +694,133 @@ __global__ void hgemm_t_8x8_sliced_k_f16x8_pack_bcf_kernel( } // TODO: Double Buffering support +template +__global__ void hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf_kernel( + half* a, half* b, half* c, const int M, const int N, const int K) { + // threads: 128/8 * 128/8 = 256 + const int bx = blockIdx.x; + const int by = blockIdx.y; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int tid = ty * blockDim.x + tx; + + __shared__ half s_a[2][BK][BM + OFFSET]; // 8*128*2=2KB + __shared__ half s_b[2][BK][BN + OFFSET]; // 8*128*2=2KB + + half r_load_a[TM/2]; // 4 + half r_load_b[TN/2]; 
// 4
+  half r_comp_a[TM]; // 8
+  half r_comp_b[TN]; // 8
+  half r_c[TM][TN] = {__float2half(0.0f)}; // 8x8 accumulators
+
+  // Map tid to s_a[BK][BM]: two threads share one row m of the A tile and each
+  // loads 4 consecutive K-dim halves, stored transposed (k-major) into s_a.
+  int load_a_smem_m = tid / 2; // 0..127 for a 256-thread block
+  // (0b00000000 & 0b00000001) << 2 = 0
+  // (0b00000001 & 0b00000001) << 2 = 4
+  // (0b00000010 & 0b00000001) << 2 = 0
+  // (0b00000011 & 0b00000001) << 2 = 4
+  int load_a_smem_k = (tid & 1) << 2; // (0,4)
+  // Map tid to s_b[BK][BN]: 32 threads share one row k of the B tile and each
+  // loads 4 consecutive N-dim halves, stored unchanged (row major) into s_b.
+  int load_b_smem_k = tid / 32; // 0..7 for a 256-thread block
+  // (0b00000000 & 0b00011111) << 2 = 0
+  // (0b00000001 & 0b00011111) << 2 = 4
+  // (0b00000010 & 0b00011111) << 2 = 8
+  // (0b00000011 & 0b00011111) << 2 = 12
+  int load_b_smem_n = (tid & 31) << 2; // (0,4,8,12,...,124)
+
+  int load_a_gmem_m = by * BM + load_a_smem_m;
+  int load_b_gmem_n = bx * BN + load_b_smem_n;
+
+  // 1) The main loop starts at bk = 1: the first tile is loaded before the loop and the last tile is computed after it (pipeline structure).
+  // 2) Compute and the next tile's store use different shared-memory buffers, so a single __syncthreads() per iteration is enough.
+  // 3) GPUs do not reorder instructions the way CPUs do, so each iteration first issues the global-memory loads of the next tile
+  //    into registers, then computes the current tile, and only afterwards writes those registers to shared memory; the in-flight
+  //    global loads (LDG) therefore do not hold up the following FMA/arithmetic instructions, which is the point of double buffering.
+
+  // bk = 0 is loading here, buffer 0
+  {
+    int load_a_gmem_k = load_a_smem_k;
+    int load_a_gmem_addr = load_a_gmem_m * K + load_a_gmem_k;
+    int load_b_gmem_k = load_b_smem_k;
+    int load_b_gmem_addr = load_b_gmem_k * N + load_b_gmem_n;
+    LDST64BITS(r_load_a[0]) = LDST64BITS(a[load_a_gmem_addr]);
+    LDST64BITS(r_load_b[0]) = LDST64BITS(b[load_b_gmem_addr]);
+
+    s_a[0][load_a_smem_k + 0][load_a_smem_m] = r_load_a[0];
+    s_a[0][load_a_smem_k + 1][load_a_smem_m] = r_load_a[1];
+    s_a[0][load_a_smem_k + 2][load_a_smem_m] = r_load_a[2];
+    s_a[0][load_a_smem_k + 3][load_a_smem_m] = r_load_a[3];
+    LDST64BITS(s_b[0][load_b_smem_k][load_b_smem_n]) = LDST64BITS(r_load_b[0]);
+  }
+  // Buffer 0 must be fully written before any thread computes from it.
+  __syncthreads();
+
+  // bk starts from 1: compute tile bk - 1 while tile bk is being fetched.
+  for (int bk = 1; bk < (K + BK - 1) / BK; bk++) {
+
+    int smem_sel = (bk - 1) & 1; // bk 1->0, bk 2->1, bk 3->0, ...
+    int smem_sel_next = bk & 1;  // bk 1->1, bk 2->0, bk 3->1, ...
+
+    int load_a_gmem_k = bk * BK + load_a_smem_k;
+    int load_a_gmem_addr = load_a_gmem_m * K + load_a_gmem_k;
+    int load_b_gmem_k = bk * BK + load_b_smem_k;
+    int load_b_gmem_addr = load_b_gmem_k * N + load_b_gmem_n;
+    LDST64BITS(r_load_a[0]) = LDST64BITS(a[load_a_gmem_addr]);
+    LDST64BITS(r_load_b[0]) = LDST64BITS(b[load_b_gmem_addr]);
+
+    #pragma unroll
+    for (int tk = 0; tk < BK; tk++) {
+      LDST128BITS(r_comp_a[0]) = LDST128BITS(s_a[smem_sel][tk][ty * TM]);
+      LDST128BITS(r_comp_b[0]) = LDST128BITS(s_b[smem_sel][tk][tx * TN]);
+
+      #pragma unroll
+      for (int tm = 0; tm < TM; tm++) {
+        #pragma unroll
+        for (int tn = 0; tn < TN; tn++) {
+          r_c[tm][tn] = __hfma(r_comp_a[tm], r_comp_b[tn], r_c[tm][tn]);
+        }
+      }
+    }
+
+    s_a[smem_sel_next][load_a_smem_k + 0][load_a_smem_m] = r_load_a[0];
+    s_a[smem_sel_next][load_a_smem_k + 1][load_a_smem_m] = r_load_a[1];
+    s_a[smem_sel_next][load_a_smem_k + 2][load_a_smem_m] = r_load_a[2];
+    s_a[smem_sel_next][load_a_smem_k + 3][load_a_smem_m] = r_load_a[3];
+    LDST64BITS(s_b[smem_sel_next][load_b_smem_k][load_b_smem_n]) = LDST64BITS(r_load_b[0]); // r_load_b holds 4 halves, so 64 bits as in the prologue
+
+    __syncthreads();
+  }
+
+  const int smem_sel_last = ((K + BK - 1) / BK - 1) & 1; // buffer holding the last K tile (1 only when the tile count is even)
+  #pragma unroll
+  for (int tk = 0; tk < BK; tk++) {
+    LDST128BITS(r_comp_a[0]) = LDST128BITS(s_a[smem_sel_last][tk][ty * TM]);
+    LDST128BITS(r_comp_b[0]) = LDST128BITS(s_b[smem_sel_last][tk][tx * TN]);
+
+    #pragma unroll
+    for (int tm = 0; tm < TM; tm++) {
+      #pragma unroll
+      for (int tn = 0; tn < TN; tn++) {
+        r_c[tm][tn] = __hfma(r_comp_a[tm], r_comp_b[tn], r_c[tm][tn]);
+      }
+    }
+  }
+
+  #pragma unroll
+  for (int i = 0; i < TM; i++) {
+    int store_c_gmem_m = by * BM + ty * TM + i;
+    int store_c_gmem_n = bx
* BN + tx * TN; + int store_c_gmem_addr = store_c_gmem_m * N + store_c_gmem_n; + LDST128BITS(c[store_c_gmem_addr]) = LDST128BITS(r_c[i][0]); + } +} // --------------------- PyTorch bindings for custom kernel ----------------------- #define STRINGFY(str) #str @@ -1022,6 +1149,35 @@ void hgemm_t_4x4_sliced_k_f16x4_pack_bcf_offset(torch::Tensor a, torch::Tensor b ); } +// Launcher for the double-buffered (dbuf) 8x8 thread-tile HGEMM kernel: reads M, K from a and N from b, then launches a 16x16-thread block per 128x128 tile of c with BK = 8. +void hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf(torch::Tensor a, torch::Tensor b, torch::Tensor c) { + CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf) + CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf) + const int M = a.size(0); + const int K = a.size(1); + const int N = b.size(1); + CHECK_TORCH_TENSOR_SHAPE(a, M, K) + CHECK_TORCH_TENSOR_SHAPE(b, K, N) + CHECK_TORCH_TENSOR_SHAPE(c, M, N) + constexpr int BM = 128; + constexpr int BN = 128; + constexpr int BK = 8; + constexpr int TM = 8; + constexpr int TN = 8; + + dim3 block(BN/TN, BM/TM); + dim3 grid((N + BN - 1) / BN, (M + BM - 1) / BM); + + hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf_kernel<<>>( + reinterpret_cast(a.data_ptr()), + reinterpret_cast(b.data_ptr()), + reinterpret_cast(c.data_ptr()), + M, N, K + ); +} + + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { TORCH_BINDING_COMMON_EXTENSION(hgemm_naive_f16) TORCH_BINDING_COMMON_EXTENSION(hgemm_sliced_k_f16) @@ -1034,4 +1190,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { TORCH_BINDING_COMMON_EXTENSION(hgemm_t_8x8_sliced_k_f16x4_pack_bcf_offset) TORCH_BINDING_COMMON_EXTENSION(hgemm_t_8x8_sliced_k_f16x8_pack_bcf) TORCH_BINDING_COMMON_EXTENSION(hgemm_t_8x8_sliced_k_f16x8_pack_bcf_offset) + TORCH_BINDING_COMMON_EXTENSION(hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf) } diff --git a/hgemm/hgemm.py b/hgemm/hgemm.py index 1d0acb7c..31e06378 100644 --- a/hgemm/hgemm.py +++ b/hgemm/hgemm.py @@ -76,9 +76,10 @@ def run_benchmark(perf_func: callable, run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x4_bcf, a, b, "f16x4(t8x8bcf)", c)
run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x4_pack, a, b, "f16x4pack(t8x8sk)", c) run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x4_pack_bcf, a, b, "f16x4pack(bcf)", c) - run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x4_pack_bcf_offset, a, b, "f16x4pack(offset)", c) + run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x4_pack_bcf_offset, a, b, "f16x4pack(bcf+offset)", c) run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf, a, b, "f16x8pack(bcf)", c) - run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_offset, a, b, "f16x8pack(offset)", c) + run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_offset, a, b, "f16x8pack(bcf+offset)", c) + run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf, a, b, "f16x8pack(dbuf)", c) run_benchmark(partial(torch.matmul, out=c), a, b, "f16_th") print("-" * 110)