-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Improve Math.BigMul on x64 by adding new internal Multiply
hardware intrinsic to X86Base
#115966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
4ed65b8
to
2f0f838
Compare
Due to a conflict with new changes in main, I had to rename the X86 method to BigMul |
I've decided to push the bmi2 / mulx support and updated the "BigMul" test results. However, I can move the mulx part to a follow-up PR to make review/testing easier. |
@jakobbotsch it seems you were not notified about this PR either (it was made before mulx for GT_MULHI, #116198), which you just reviewed. I've made some minor changes based on that feedback |
eac6624
to
bfa9de4
Compare
{ | ||
isRMW = false; | ||
|
||
SingleTypeRegSet apxAwareRegCandidates = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI: I have a separate commit that adds EDX as a fixed register for the return value instead of the operand.
It seems to give slightly better PerfScore for XxHashShared:MergeAccumulators, but seems a bit backwards to specify target register instead of source. Also the code difference might be fixed by future improvements to register allocator instead.
How do you feel about it? I am not sure if it is better or not.
generated assembly
With commit that fix return value in rdx:
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx rax, rdx, rax
xor rax, rdx
add rax, r8
mov rdx, qword ptr [rcx+0x10]
mov r8, qword ptr [rsp+0x10]
xor rdx, qword ptr [r8+0x10]
mov r10, qword ptr [rcx+0x18]
xor r10, qword ptr [r8+0x18]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x20]
xor rdx, qword ptr [r8+0x20]
mov r10, qword ptr [rcx+0x28]
xor r10, qword ptr [r8+0x28]
mulx r10, rdx, r10
xor r10, rdx
add rax, r10
mov rdx, qword ptr [rcx+0x30]
xor rdx, qword ptr [r8+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [r8+0x38]
mulx rcx, rdx, rcx
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=153 bbWeight=1 PerfScore 60.50
G_M1045_IG03: ;; offset=0x0099
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154
Without commit
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mov rdx, r10
mulx r10, rax, rax
xor rax, r10
add rax, r8
mov r8, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mov rdx, r10
mulx r10, r8, r8
xor r8, r10
add rax, r8
mov r8, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor r8, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mov rdx, rcx
mulx rdx, rcx, r8
xor rcx, rdx
add rax, rcx
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=182 bbWeight=1 PerfScore 65.25
G_M1045_IG03: ;; offset=0x00B6
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183
Without AVX2 (mul only)
; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M1045_IG02: ;; offset=0x0000
mov rax, qword ptr [rcx]
xor rax, qword ptr [rdx]
mov r10, qword ptr [rcx+0x08]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x08]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x10]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x10]
mov r10, qword ptr [rcx+0x18]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x18]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x20]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x20]
mov r10, qword ptr [rcx+0x28]
mov qword ptr [rsp+0x10], rdx
xor r10, qword ptr [rdx+0x28]
mul rdx:rax, r10
xor rax, rdx
add r8, rax
mov rax, qword ptr [rcx+0x30]
mov rdx, qword ptr [rsp+0x10]
xor rax, qword ptr [rdx+0x30]
mov rcx, qword ptr [rcx+0x38]
xor rcx, qword ptr [rdx+0x38]
mul rdx:rax, rcx
xor rax, rdx
add rax, r8
mov rcx, rax
shr rcx, 37
xor rcx, rax
mov rax, 0x165667919E3779F9
imul rax, rcx
mov rcx, rax
shr rcx, 32
xor rax, rcx
;; size=162 bbWeight=1 PerfScore 64.25
G_M1045_IG03: ;; offset=0x00A2
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163
bfa9de4
to
7db8372
Compare
cc @dotnet/jit-contrib |
The biggest improvements are for signed long and for platforms without BMI2.
A nice side effect is that the ReadyToRun code can now emit a simple mul instead of having to fall back to the 32-bit code.
This pull request introduces an internal
Multiply
hardware intrinsics (NI_X86Base_Multiply
andNI_X86Base_X64_Multiply
) for x86 and x64 architectures in the JIT compiler and calls them fromMath.BigMul
This improves the machine code for signed BigMul, which should fix #75594, based on the API shape suggested in #58263
It can also help with implementing IntPtr.BigMul #114731
NOTES:
The code is heavily based on the DivRem code introduced in Implement DivRem intrinsic for X86 #66551 (I went through the current version of all the files touched and tried to add similar code for multiply).
I did not do Mono; instead I tried to use conditional compilation to exclude it from Mono (since it does not seem as straightforward and I do not know how to test the various combinations). Also, it seems like Mono might already have special cases for BigMul.
I have not touched the JIT compiler before, so while the code executes and seems to work fine, I might have missed something.
Since it uses tuples, it has some of the downsides of DivRem (especially on Windows), such as extra temp variables and stack spills, so there might be a few scenarios where performance is slightly worse or the same. (There was some discussion in Consume DivRem intrinsics from Math.DivRem #82194)
There might be other, better solutions, including special handling of Math.BigMul, but that would probably add too many new changes to the JIT for me to take on.
Examples of generated code
Produces the following
With BMI2 (mulx)
code before
Further code samples with array access
Benchmarks
The Full Benchmark code is found here
The benchmarks are based on a benchmark suggested for MultiplyNoFlags; the code below does the following:
Generated code with DOTNET_EnableBMI2=1
Generated code with DOTNET_EnableAVX2=0
A single push to the stack and several reads/writes since rax is spilled.
Baseline: Calling old MultiplyNoFlags
Results for Math.BigMul with BMI2
UPDATED measurements
Hardware without BMI2, "~10 times faster"
Additional benchmarks results
Additional results can be found under https://github.com/Daniel-Svensson/ClrExperiments/tree/7acd61943336356fa363763914a5b963de962065/ClrDecimal/Benchmarks/BenchmarkDotNet.Artifacts/results . I mostly checked that there were no significant regressions to decimal performance, since Math.BigMul has several usages there. There were a few minor improvements, mostly in the composite "InterestBenchmarks" suite, which contains a mix of operations similar to interest calculation.
Copilot Summary
Summary
JIT Compiler Enhancements
Multiply
intrinsics in the JIT compiler, including updates toContainCheckHWIntrinsic
,BuildHWIntrinsic
, andimpSpecialIntrinsic
to handle the new instructions and their constraints (src/coreclr/jit/lowerxarch.cpp
,src/coreclr/jit/lsraxarch.cpp
,src/coreclr/jit/hwintrinsicxarch.cpp
). [1] [2] [3]HWIntrinsicInfo
andGenTreeHWIntrinsic
to include theMultiply
intrinsics and their associated properties (src/coreclr/jit/hwintrinsic.h
,src/coreclr/jit/gentree.cpp
). [1] [2]hwintrinsiclistxarch.h
to define theMultiply
intrinsics and their characteristics, such as instruction mapping and flags (src/coreclr/jit/hwintrinsiclistxarch.h
). [1] [2]Runtime Library Updates
X86Base.Multiply
methods for both signed and unsigned multiplication in the runtime intrinsics API, providing platform-specific implementations or fallback behavior (src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs
,src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.PlatformNotSupported.cs
). [1] [2]Math
class to use the newMultiply
intrinsics for optimizedBigMul
operations, improving performance on supported platforms (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]Code Cleanup
Math
class (src/libraries/System.Private.CoreLib/src/System/Math.cs
). [1] [2]These changes collectively enhance the performance and capabilities of multiplication operations in .NET, leveraging hardware acceleration where available.Summary: