-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Improve Math.BigMul on x64 by adding new internal Multiply
hardware intrinsic to X86Base
#115966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
4ed65b8
to
2f0f838
Compare
Due to a conflict with new changes in main, I had to rename the X86 method to BigMul |
I've decided to push the bmi2 / mulx support and updated the "BigMul" test results. However, I can move the mulx part to a follow-up PR to make review/testing easier. |
@jakobbotsch it seems you were not notified about this PR either (it was made before mulx for GT_MULHI, #116198), which you just reviewed. I've made some minor changes based on that feedback |
eac6624
to
bfa9de4
Compare
{ | ||
isRMW = false; | ||
|
||
SingleTypeRegSet apxAwareRegCandidates = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI: I have a separate commit that adds EDX as a fixed register for the return value instead of the operand.
It seems to give slightly better PerfScore for XxHashShared:MergeAccumulators, but seems a bit backwards to specify target register instead of source. Also the code difference might be fixed by future improvements to register allocator instead.
How do you feel about it? I am not sure if it is better or not.
generated assembly
With commit that fix return value in rdx:
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx rax, rdx, rax
xor rax, rdx
add rax, r8
mov rdx, qword ptr [rcx+0x10]
mov r8, qword ptr [rsp+0x10]
xor rdx, qword ptr [r8+0x10]
mov r10, qword ptr [rcx+0x18]
xor r10, qword ptr [r8+0x18]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x20]
xor rdx, qword ptr [r8+0x20]
mov r10, qword ptr [rcx+0x28]
xor r10, qword ptr [r8+0x28]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x30]
xor rdx, qword ptr [r8+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [r8+0x38]
mulx rcx, rdx, rcx
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=153 bbWeight=1 PerfScore 60.50
G_M1045_IG03: ;; offset=0x0099
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154
Without commit
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx r10, rax, rax
xor rax, r10
add rax, r8
mov r8, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mov rdx, rcx
mulx rdx, rcx, r8
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=182 bbWeight=1 PerfScore 65.25
G_M1045_IG03: ;; offset=0x00B6
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183
Without AVX2 (mul only)
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mul rdx:rax, rcx
xor rax, rdx
add rax, r8
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=162 bbWeight=1 PerfScore 64.25
G_M1045_IG03: ;; offset=0x00A2
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163
bfa9de4
to
7db8372
Compare
cc @dotnet/jit-contrib |
The biggest improvements are for signed long and for platforms without BMI2.
A nice side effect is that the ReadyToRun code can now emit a simple mul instead of having to fall back to the 32-bit code.
This pull request introduces an internal
Multiply
hardware intrinsics (NI_X86Base_Multiply
andNI_X86Base_X64_Multiply
) for x86 and x64 architectures in the JIT compiler and calls them fromMath.BigMul
This improves the machine code for signed BigMul, which should fix #75594, based on the API shape suggested in #58263
It can also help with implementing IntPtr.BigMul #114731
NOTES:
The code is heavily based on the DivRem code introduced in Implement DivRem intrinsic for X86 #66551 (I went through the current version of all the files touched and tried to add similar code for multiply).
I did not do Mono; instead I tried to use conditional compilation to exclude it from Mono (since it does not seem as straightforward and I do not know how to test the various combinations). Also, it seems like Mono might already have special cases for BigMul.
I have not touched the JIT compiler before, so while the code executes and seems to work fine, I might have missed something.
Since it uses tuples, it has some of the downsides of DivRem (especially on Windows), such as extra temp variables and stack spills, so there might be a few scenarios where performance is slightly worse or the same. (There was some discussion in Consume DivRem intrinsics from Math.DivRem #82194)
There might be other, better solutions, including special handling of Math.BigMul, but that would probably add too many new changes to the JIT for me to take on.
Examples of generated code
Produces the following
With BMI2 (mulx)
code before
Further code samples with array access
Benchmarks
The Full Benchmark code is found here
The benchmarks are based on a benchmark suggested for MultiplyNoFlags; the code below does the following:
Generated code with DOTNET_EnableBMI2=1
Generated code with DOTNET_EnableAVX2=0
A single push to the stack and several reads/writes since rax is spilled.
Baseline: Calling old MultiplyNoFlags
Results for Math.BigMul with BMI2
UPDATED measurements
Hardware without BMI2, "~10 times faster"
Additional benchmarks results
Additional results can be found under https://github.com/Daniel-Svensson/ClrExperiments/tree/7acd61943336356fa363763914a5b963de962065/ClrDecimal/Benchmarks/BenchmarkDotNet.Artifacts/results . I mostly checked that there were no significant regressions to decimal performance, since Math.BigMul has several usages there. There were a few minor improvements, mostly in the composite "InterestBenchmarks" suite, which contains a mix of operations similar to interest calculation.
Copilot Summary
Summary
JIT Compiler Enhancements
Multiply
intrinsics in the JIT compiler, including updates toContainCheckHWIntrinsic
,BuildHWIntrinsic
, andimpSpecialIntrinsic
to handle the new instructions and their constraints (src/coreclr/jit/lowerxarch.cpp
,src/coreclr/jit/lsraxarch.cpp
,src/coreclr/jit/hwintrinsicxarch.cpp
). [1] [2] [3]HWIntrinsicInfo
andGenTreeHWIntrinsic
to include theMultiply
intrinsics and their associated properties (src/coreclr/jit/hwintrinsic.h
,src/coreclr/jit/gentree.cpp
). [1] [2]hwintrinsiclistxarch.h
to define theMultiply
intrinsics and their characteristics, such as instruction mapping and flags (src/coreclr/jit/hwintrinsiclistxarch.h
). [1] [2]Runtime Library Updates
X86Base.Multiply
methods for both signed and unsigned multiplication in the runtime intrinsics API, providing platform-specific implementations or fallback behavior (src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
,src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.PlatformNotSupported.cs
). [1] [2]Math
class to use the newMultiply
intrinsics for optimizedBigMul
operations, improving performance on supported platforms (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]Code Cleanup
Math
class (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]These changes collectively enhance the performance and capabilities of multiplication operations in .NET, leveraging hardware acceleration where available.Summary: