-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Improve Math.BigMul on x64 by adding new internal Multiply
hardware intrinsic to X86Base
#115966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
4ed65b8
to
2f0f838
Compare
Due to conflict with new changes in main I had to rename the X86 method to BigMul |
I've decided to push the bmi2 / mulx support and updated the "BigMul" test results. However, I can move the mulx part into a follow-up PR to make review/testing easier. |
@jakobbotsch it seems you were not notified about this PR either (it was made before mulx for GT_MULHI, #116198 ) which you just reviewed. I've made some minor changes based on that feedback |
eac6624
to
bfa9de4
Compare
{ | ||
isRMW = false; | ||
|
||
SingleTypeRegSet apxAwareRegCandidates = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI: I have a separate commit that adds EDX as a fixed register for the return value instead of the operand.
It seems to give slightly better PerfScore for XxHashShared:MergeAccumulators, but seems a bit backwards to specify target register instead of source. Also the code difference might be fixed by future improvements to register allocator instead.
How do you feel about it? I am not sure if it is better or not.
generated assembly
With commit that fix return value in rdx:
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx rax, rdx, rax
xor rax, rdx
add rax, r8
mov rdx, qword ptr [rcx+0x10]
mov r8, qword ptr [rsp+0x10]
xor rdx, qword ptr [r8+0x10]
mov r10, qword ptr [rcx+0x18]
xor r10, qword ptr [r8+0x18]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x20]
xor rdx, qword ptr [r8+0x20]
mov r10, qword ptr [rcx+0x28]
xor r10, qword ptr [r8+0x28]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x30]
xor rdx, qword ptr [r8+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [r8+0x38]
mulx rcx, rdx, rcx
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=153 bbWeight=1 PerfScore 60.50
G_M1045_IG03: ;; offset=0x0099
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154
Without commit
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx r10, rax, rax
xor rax, r10
add rax, r8
mov r8, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mov rdx, rcx
mulx rdx, rcx, r8
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=182 bbWeight=1 PerfScore 65.25
G_M1045_IG03: ;; offset=0x00B6
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183
Without AVX2 (mul only)
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mul rdx:rax, rcx
xor rax, rdx
add rax, r8
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=162 bbWeight=1 PerfScore 64.25
G_M1045_IG03: ;; offset=0x00A2
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@EgorBo do you have any preference on which approach to take here? (Hardcode rdx as return register or not)
The main advantage with the other approach as i see it is that the code, and generated code for mul and mulx becomes more similar.
I will try to clarify the comment for the swap and fix the conflict in a day or two, and would like to know if there is anything else to change.
bfa9de4
to
7db8372
Compare
cc @dotnet/jit-contrib |
I've opened up #117261 that only contains the first part without mulx support of this PR to ensure that all tests still pass without mulx support. If you want to review and merge everything at once (this PR) then feel free to close the other PR. As for mulx I am slightly in favor of making the change #115966 (comment) since it would work more similar to how mul behaves (but it trashes only edx instead of 2 fixed registers). |
@EgorBo, PTAL. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR introduces optimized x86/x64 hardware intrinsics for BigMul operations to improve performance of Math.BigMul
methods, particularly for signed multiplication and on platforms without BMI2. The implementation follows the pattern established by the existing DivRem intrinsics.
Key changes:
- Adds internal
BigMul
hardware intrinsics toX86Base
for both 32-bit and 64-bit operations - Updates
Math.BigMul
methods to use the new intrinsics when available on x86/x64 platforms - Implements comprehensive JIT compiler support for code generation, register allocation, and optimization
Reviewed Changes
Copilot reviewed 10 out of 10 changed files in this pull request and generated 8 comments.
Show a summary per file
File | Description |
---|---|
X86Base.cs | Added internal BigMul intrinsic methods for various integer types with appropriate documentation |
X86Base.PlatformNotSupported.cs | Added platform not supported stubs for the new BigMul intrinsics |
Math.cs | Updated BigMul implementations to use new intrinsics on supported platforms, with conditional compilation for optimization |
lsraxarch.cpp | Implemented register allocation logic for BigMul intrinsics with BMI2 and non-BMI2 code paths |
lowerxarch.cpp | Added containment check support for BigMul intrinsics |
hwintrinsicxarch.cpp | Added import handling for BigMul intrinsics with multi-register return support |
hwintrinsiclistxarch.h | Defined BigMul intrinsic metadata and instruction mappings |
hwintrinsiccodegenxarch.cpp | Implemented code generation for BigMul with MULX and MUL/IMUL instruction variants |
hwintrinsic.h | Added BigMul intrinsics to multi-register return count logic |
gentree.cpp | Added layout support for BigMul intrinsic return types |
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
/// <remarks> | ||
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(long, long, out long)" />.</para> | ||
/// </remarks> | ||
internal static (long Lower, long Upper) BigMul(long left, long right) => BigMul(left, right); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method has infinite recursion. The method calls itself with the same parameters, which will cause a stack overflow. It should likely call the platform-specific implementation or throw a not implemented exception.
internal static (long Lower, long Upper) BigMul(long left, long right) => BigMul(left, right); | |
internal static (long Lower, long Upper) BigMul(long left, long right) => throw new PlatformNotSupportedException(); |
Copilot uses AI. Check for mistakes.
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
Show resolved
Hide resolved
/// <remarks>Intented for UIntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> | ||
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) => BigMul(left, right); | ||
|
||
/// <summary> IMUL reg/m</summary> | ||
/// <remarks>Intented for IntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo in 'Intented' should be 'Intended'.
/// <remarks>Intented for UIntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> | |
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) => BigMul(left, right); | |
/// <summary> IMUL reg/m</summary> | |
/// <remarks>Intented for IntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> | |
/// <remarks>Intended for UIntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> | |
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) => BigMul(left, right); | |
/// <summary> IMUL reg/m</summary> | |
/// <remarks>Intended for IntPtr.Bigmul https://github.com/dotnet/runtime/issues/114731 </remarks> |
Copilot uses AI. Check for mistakes.
internal static (nuint Lower, nuint Upper) BigMul(nuint left, nuint right) { throw new PlatformNotSupportedException(); } | ||
|
||
/// <summary> IMUL reg/m</summary> | ||
internal static (nint Lower, nint Upper) BigMul(nint left, nint right) { throw new PlatformNotSupportedException(); } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Daniel-Svensson sorry for the delayed response, this fell off my radar due to .NET 10.0 release work, could you please resolve the merge conflicts and Copilot's reviews? |
The biggest improvements are for signed long and for platforms without BMI2.
A nice side effect is that the ReadyToRun code can now emit a simple mul instead of having to fall back to the 32-bit code.
This pull request introduces an internal
Multiply
hardware intrinsics (NI_X86Base_Multiply
andNI_X86Base_X64_Multiply
) for x86 and x64 architectures in the JIT compiler and calls them fromMath.BigMul
This improves the machine code for signed BigMul which should fix #75594 based on the API shape suggested in #58263
It can also help with implementing IntPtr.BigMul #114731
NOTES:
The code is heavily based on the DivRem code introduced in Implement DivRem intrinsic for X86 #66551 (I went through the current version of all the files touched and tried to add similar code for multiply).
I did not do Mono, so I tried to use conditional compilation to exclude it from Mono (since it does not seem as straightforward and I do not know how to test the various combinations). Also, it seems like Mono might already have special cases for BigMul.
I have not touched the JIT compiler before, so while the code executes and seems to work fine, I might have missed something.
Since it uses tuples it has some of the downsides of DivRem (especially on Windows), such as extra temp variables and stack spills, so there might be a few scenarios where performance is slightly worse or the same. (There was some discussion in Consume DivRem intrinsics from Math.DivRem #82194 )
There might be other, better solutions, including special handling of Math.BigMul, but that would probably be too many new changes to the JIT for me to take on.
Examples of generated code
Produces the following
With BMI2 (mulx)
code before
Further code samples with array access
Benchmarks
The Full Benchmark code is found here
The benchmarks are based on a benchmark suggested for MultiplyNoFlags; the code below does the following:
Generated code with DOTNET_EnableBMI2=1
Generated code with DOTNET_EnableAVX2=0
A single push to the stack and several reads/writes since rax is spilled.
Baseline: Calling old MultiplyNoFlags
Results for Math.Bigmul with BMI2
UPDATED measurements
Hardware without BMI2, "~10 times faster"
Additional benchmarks results
Additional results can be found under https://github.com/Daniel-Svensson/ClrExperiments/tree/7acd61943336356fa363763914a5b963de962065/ClrDecimal/Benchmarks/BenchmarkDotNet.Artifacts/results . I mostly checked that there were no significant regressions to decimal performance, since Math.BigMul has several usages there. There were a few minor improvements, mostly in the composite "InterestBenchmarks", which contains a mix of operations similar to interest calculation.
Copilot Summary
Summary
JIT Compiler Enhancements
Multiply
intrinsics in the JIT compiler, including updates toContainCheckHWIntrinsic
,BuildHWIntrinsic
, andimpSpecialIntrinsic
to handle the new instructions and their constraints (src/coreclr/jit/lowerxarch.cpp
,src/coreclr/jit/lsraxarch.cpp
,src/coreclr/jit/hwintrinsicxarch.cpp
). [1] [2] [3]HWIntrinsicInfo
andGenTreeHWIntrinsic
to include theMultiply
intrinsics and their associated properties (src/coreclr/jit/hwintrinsic.h
,src/coreclr/jit/gentree.cpp
). [1] [2]hwintrinsiclistxarch.h
to define theMultiply
intrinsics and their characteristics, such as instruction mapping and flags (src/coreclr/jit/hwintrinsiclistxarch.h
). [1] [2]Runtime Library Updates
X86Base.Multiply
methods for both signed and unsigned multiplication in the runtime intrinsics API, providing platform-specific implementations or fallback behavior (src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
,src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.PlatformNotSupported.cs
). [1] [2]Math
class to use the newMultiply
intrinsics for optimizedBigMul
operations, improving performance on supported platforms (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]Code Cleanup
Math
class (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]These changes collectively enhance the performance and capabilities of multiplication operations in .NET, leveraging hardware acceleration where available.Summary: