Skip to content

Improve Math.BigMul on x64 by adding new internal Multiply hardware intrinsic to X86Base #115966

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

Daniel-Svensson
Copy link
Contributor

@Daniel-Svensson Daniel-Svensson commented May 24, 2025

The biggest improvements are for signed long and for platforms without BMI2.
A nice side effect is that ReadyToRun code can now emit a simple mul instead of having to fall back to the 32-bit code.

This pull request introduces internal Multiply hardware intrinsics (NI_X86Base_Multiply and NI_X86Base_X64_Multiply) for the x86 and x64 architectures in the JIT compiler and calls them from Math.BigMul.

This improves the machine code for signed BigMul which should fix #75594 based on the API shape suggested in #58263
It can also help with implementing IntPtr.BigMul #114731

NOTES:

  • The code is heavily based on the DivRem code introduced in Implement DivRem intrinsic for X86 #66551 (I went through the current version of all the files touched and tried to add similar code for multiply).

  • I did not implement this for Mono, so I tried to use conditional compilation to exclude it there (it does not seem as straightforward, and I do not know how to test the various combinations). It also seems like Mono might already have special cases for BigMul.

  • I have not touched the JIT compiler before, so while the code executes and seems to work fine, I might have missed something.

  • Since it uses tuples, it has some of the downsides of DivRem (especially on Windows), where extra temp variables and stack spills are introduced, so there might be a few scenarios where performance is slightly worse or the same. (There was some discussion in Consume DivRem intrinsics from Math.DivRem #82194.)

  • There might be other, better solutions, including special handling of Math.BigMul, but that would probably require too many new changes to the JIT for me to take on.

Examples of generated code

[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
static void TestBigMul2(ref ulong x, ref ulong y)
{
    x = Math.BigMul(x, y, out y);
}

[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
static void TestBigMul1(ref long x, ref long y)
{
    x = Math.BigMul(x, y, out y);
}

Produces the following

; Method Program:<<Main>$>g__TestBigMul2|0_5(byref,byref) (FullOpts)
G_M36427_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M36427_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x10], rdx
       mul      rdx:rax, qword ptr [rdx]
       mov      r8, bword ptr [rsp+0x10]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx], rdx
						;; size=22 bbWeight=1 PerfScore 12.00

G_M36427_IG03:  ;; offset=0x0016
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 23



; Method Program:<<Main>$>g__TestBigMul1|0_2(byref,byref) (FullOpts)
G_M20175_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M20175_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x10], rdx
       imul     rdx:rax, qword ptr [rdx]
       mov      r8, bword ptr [rsp+0x10]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx], rdx
						;; size=22 bbWeight=1 PerfScore 12.00

G_M20175_IG03:  ;; offset=0x0016
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 23

With BMI2 (mulx)

; Method Program:<<Main>$>g__TestBigMul2|0_5(byref,byref) (FullOpts)
G_M36427_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M36427_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x10], rdx
       mov      rdx, rax
       mulx     rdx, rax, qword ptr [rdx]
       mov      r8, bword ptr [rsp+0x10]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx], rdx
						;; size=27 bbWeight=1 PerfScore 12.25

G_M36427_IG03:  ;; offset=0x001B
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 28
code before
; Method Program:<<Main>$>g__TestBigMul2|0_5(byref,byref) (FullOpts)
G_M000_IG01:                ;; offset=0x0000
       push     rax
       mov      bword ptr [rsp+0x18], rdx

G_M000_IG02:                ;; offset=0x0006
       mov      rdx, qword ptr [rcx]
       mov      rax, bword ptr [rsp+0x18]
       mov      r8, qword ptr [rax]
       lea      r10, [rsp]
       mulx     rdx, r9, r8
       mov      qword ptr [r10], r9
       mov      r8, qword ptr [rsp]
       mov      qword ptr [rax], r8
       mov      qword ptr [rcx], rdx

G_M000_IG03:                ;; offset=0x0027
       add      rsp, 8
       ret      
; Total bytes of code: 44

; Assembly listing for method Program:<<Main>$>g__TestBigMul1|0_6(byref,byref) (FullOpts)
G_M000_IG01:                ;; offset=0x0000
       push     rax
 
G_M000_IG02:                ;; offset=0x0001
       mov      rax, qword ptr [rcx]
       mov      bword ptr [rsp+0x18], rdx
       mov      r8, qword ptr [rdx]
       lea      r10, [rsp]
       mov      rdx, rax
       mulx     rdx, r9, r8
       mov      qword ptr [r10], r9
       mov      r10, qword ptr [rsp]
       mov      r9, bword ptr [rsp+0x18]
       mov      qword ptr [r9], r10
       mov      r10, rax
       sar      r10, 63
       and      r10, r8
       sub      rdx, r10
       sar      r8, 63
       and      rax, r8
       sub      rdx, rax
       mov      qword ptr [rcx], rdx
 
G_M000_IG03:                ;; offset=0x0041
       add      rsp, 8
       ret      
 
; Total bytes of code 70
Further code samples with array access
static long TestBigMulArr2(long[] x, ref long y)
{
    return Math.BigMul(y, x[1], out y);
}

				
static void TestBigMulArr12(long[] x, ref long y)
{
    x[1] = Math.BigMul(y, x[1], out y);
}
		
; Method Program:<<Main>$>g__TestBigMulArr2|0_9(long[],byref):long (FullOpts)
G_M31792_IG01:  ;; offset=0x0000
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25

G_M31792_IG02:  ;; offset=0x0004
       mov      bword ptr [rsp+0x38], rdx
       mov      rax, qword ptr [rdx]
       cmp      dword ptr [rcx+0x08], 1
       jbe      SHORT G_M31792_IG04
       imul     rdx:rax, qword ptr [rcx+0x18]
       mov      rcx, bword ptr [rsp+0x38]
       mov      qword ptr [rcx], rax
       mov      rax, rdx
						;; size=29 bbWeight=1 PerfScore 15.25

G_M31792_IG03:  ;; offset=0x0021
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25

G_M31792_IG04:  ;; offset=0x0026
       call     CORINFO_HELP_RNGCHKFAIL
       int3     
						;; size=6 bbWeight=0 PerfScore 0.00
; Total bytes of code: 44


; Method Program:<<Main>$>g__TestBigMulArr12|0_10(long[],byref) (FullOpts)
G_M43177_IG01:  ;; offset=0x0000
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25

G_M43177_IG02:  ;; offset=0x0004
       mov      bword ptr [rsp+0x38], rdx
       mov      rax, qword ptr [rdx]
       mov      r8d, dword ptr [rcx+0x08]
       cmp      r8d, 1
       jbe      SHORT G_M43177_IG04
       imul     rdx:rax, qword ptr [rcx+0x18]
       mov      r8, bword ptr [rsp+0x38]
       mov      qword ptr [r8], rax
       mov      qword ptr [rcx+0x18], rdx
						;; size=34 bbWeight=1 PerfScore 15.25

G_M43177_IG03:  ;; offset=0x0026
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25

G_M43177_IG04:  ;; offset=0x002B
       call     CORINFO_HELP_RNGCHKFAIL
       int3     
						;; size=6 bbWeight=0 PerfScore 0.00
; Total bytes of code: 49

Benchmarks

The Full Benchmark code is found here

The benchmarks are based on a benchmark suggested for MultiplyNoFlags; the one below does the following:

        [Benchmark]
        public ulong BenchBigMulUnsigned()
        {
            ulong accLo = TestA;
            ulong accHi = TestB;
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            MathBigMulAcc(accLo, accHi, ref accHi, ref accLo);
            return accLo + accHi;
        }
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private unsafe void MathBigMulAcc(ulong a, ulong b, ref ulong accHi, ref ulong accLo)
        {
            ulong lo;
            ulong hi = Math.BigMul(a, b, out lo);
            accHi += hi;
            accLo += lo;
        }
Generated code with DOTNET_EnableBMI2=1
; Method Benchmarks.Scenarios.BigMulTests:BenchBigMulUnsigned():ulong:this (FullOpts)
G_M56495_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M56495_IG02:  ;; offset=0x0000
       mov      rdx, qword ptr [rcx+0x08]
       mov      rax, qword ptr [rcx+0x10]
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rax, r8
       add      rdx, rcx
       mulx     r8, rcx, rax
       add      rcx, rdx
       add      rax, rcx
       add      rax, r8
						;; size=55 bbWeight=1 PerfScore 18.25

G_M56495_IG03:  ;; offset=0x0037
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 56
Generated code with DOTNET_EnableAVX2=0

A single push to the stack and several reads/writes since rax is spilled.

; Method Benchmarks.Scenarios.BigMulTests:BenchBigMulUnsigned():ulong:this (FullOpts)
G_M56495_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M56495_IG02:  ;; offset=0x0000
       mov      r8, qword ptr [rcx+0x08]
       mov      rcx, qword ptr [rcx+0x10]
       mov      rax, r8
       mul      rdx:rax, rcx
       add      rcx, rdx
       add      r8, rax
       mov      rax, r8
       mul      rdx:rax, rcx
       add      rcx, rdx
       add      r8, rax
       mov      rax, r8
       mul      rdx:rax, rcx
       add      rcx, rdx
       add      r8, rax
       mov      rax, r8
       mul      rdx:rax, rcx
       add      rax, r8
       add      rax, rcx
       add      rax, rdx
						;; size=59 bbWeight=1 PerfScore 19.25

G_M56495_IG03:  ;; offset=0x003B
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 60
Baseline: Calling old MultiplyNoFlags
; Method Benchmarks.Scenarios.BigMulTests:BenchMultiplyNoFlags3Ards():ulong:this (FullOpts)
G_M20411_IG01:  ;; offset=0x0000
       sub      rsp, 40
       xor      eax, eax
       mov      qword ptr [rsp+0x08], rax
       vxorps   xmm4, xmm4, xmm4
       vmovdqa  xmmword ptr [rsp+0x10], xmm4
       mov      qword ptr [rsp+0x20], rax
						;; size=26 bbWeight=1 PerfScore 4.83

G_M20411_IG02:  ;; offset=0x001A
       mov      rdx, qword ptr [rcx+0x08]
       mov      rax, qword ptr [rcx+0x10]
       lea      rcx, [rsp+0x20]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x20]
       lea      rcx, [rsp+0x18]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x18]
       lea      rcx, [rsp+0x10]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       add      rax, r10
       add      rdx, qword ptr [rsp+0x10]
       lea      rcx, [rsp+0x08]
       mulx     r10, r8, rax
       mov      qword ptr [rcx], r8
       mov      rcx, rdx
       add      rcx, qword ptr [rsp+0x08]
       add      rax, rcx
       add      rax, r10
						;; size=98 bbWeight=1 PerfScore 31.50

G_M20411_IG03:  ;; offset=0x007C
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code: 129
Results for Math.BigMul with BMI2

BenchmarkDotNet v0.14.0, Windows 11 (10.0.26100.4202)
AMD Ryzen 7 5800X, 1 CPU, 16 logical and 8 physical cores
.NET SDK 10.0.100-preview.3.25201.16
  [Host]     : .NET 10.0.0 (10.0.25.17105), X64 RyuJIT AVX2
  Job-ZAWDHL : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX2
  Job-RAEHQD : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX2

Method Job Toolchain TestA TestB Mean Error StdDev Ratio
BenchBigMulUnsigned Job-ZAWDHL \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 0.6420 ns 0.0065 ns 0.0061 ns 0.42
BenchBigMulUnsigned Job-RAEHQD \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 1.5354 ns 0.0141 ns 0.0125 ns 1.00
BenchBigMulSigned Job-ZAWDHL \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 1.2853 ns 0.0070 ns 0.0065 ns 0.43
BenchBigMulSigned Job-RAEHQD \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 2.9852 ns 0.0263 ns 0.0246 ns 1.00
BenchMultiplyNoFlags3Ards Job-ZAWDHL \net10.0-windows-Release-x64\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 3.3122 ns 0.0060 ns 0.0056 ns 1.00
BenchMultiplyNoFlags3Ards Job-RAEHQD \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 3.3080 ns 0.0058 ns 0.0054 ns 1.00

UPDATED measurements

Method Job Toolchain TestA TestB Mean Error StdDev Ratio RatioSD
BenchBigMulUnsigned Job-QMWUGV CoreRun 81985529216486895 16045690984833335023 0.8473 ns 0.0074 ns 0.0069 ns 1.00 0.01
BenchBigMulSigned Job-QMWUGV CoreRun 81985529216486895 16045690984833335023 0.8525 ns 0.0107 ns 0.0095 ns 1.00 0.02
Hardware without BMI2, "~10 times faster"
Method Job Toolchain TestA TestB Mean Error StdDev Ratio RatioSD
BenchBigMulUnsigned Job-JCYSGS \net10.0-windows-Release-x64_MathInstrinct\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 1.283 ns 0.0104 ns 0.0092 ns 0.10 0.00
BenchBigMulUnsigned Job-SJSTKO \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 12.256 ns 0.0133 ns 0.0118 ns 1.00 0.00
BenchBigMulSigned Job-JCYSGS \net10.0-windows-Release-x64_MathInstrinct\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 1.275 ns 0.0051 ns 0.0048 ns 0.12 0.00
BenchBigMulSigned Job-SJSTKO \net10.0-windows-Release-x64_main\shared\Microsoft.NETCore.App\10.0.0\corerun.exe 81985529216486895 16045690984833335023 10.783 ns 0.0743 ns 0.0620 ns 1.00 0.00

Additional benchmarks results

Additional results can be found under https://github.com/Daniel-Svensson/ClrExperiments/tree/7acd61943336356fa363763914a5b963de962065/ClrDecimal/Benchmarks/BenchmarkDotNet.Artifacts/results . I mostly checked that there were no significant regressions to decimal performance, since Math.BigMul has several usages there. There were a few minor improvements, mostly in the composite "InterestBenchmarks", which contains a mix of operations similar to interest calculation.

Copilot Summary

Summary

JIT Compiler Enhancements

  • Added support for Multiply intrinsics in the JIT compiler, including updates to ContainCheckHWIntrinsic, BuildHWIntrinsic, and impSpecialIntrinsic to handle the new instructions and their constraints (src/coreclr/jit/lowerxarch.cpp, src/coreclr/jit/lsraxarch.cpp, src/coreclr/jit/hwintrinsicxarch.cpp). [1] [2] [3]
  • Updated HWIntrinsicInfo and GenTreeHWIntrinsic to include the Multiply intrinsics and their associated properties (src/coreclr/jit/hwintrinsic.h, src/coreclr/jit/gentree.cpp). [1] [2]
  • Extended hwintrinsiclistxarch.h to define the Multiply intrinsics and their characteristics, such as instruction mapping and flags (src/coreclr/jit/hwintrinsiclistxarch.h). [1] [2]

Runtime Library Updates

  • Introduced X86Base.Multiply methods for both signed and unsigned multiplication in the runtime intrinsics API, providing platform-specific implementations or fallback behavior (src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs, src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.PlatformNotSupported.cs). [1] [2]
  • Updated the Math class to use the new Multiply intrinsics for optimized BigMul operations, improving performance on supported platforms (src/libraries/System.Private.CoreLib/src/System/Math.cs). [1] [2]

Code Cleanup

  • Removed outdated and unused code paths related to older multiplication implementations in the Math class (src/libraries/System.Private.CoreLib/src/System/Math.cs). [1] [2]

These changes collectively enhance the performance and capabilities of multiplication operations in .NET, leveraging hardware acceleration where available.

@Daniel-Svensson
Copy link
Contributor Author

Due to conflict with new changes in main I had to rename the X86 method to BigMul

@Daniel-Svensson
Copy link
Contributor Author

Daniel-Svensson commented Jun 3, 2025

I've decided to push the BMI2 / mulx support and have updated the "BigMul" test results.
There are also some nice improvements to XXHash from this commit, but they might just as well come from updating main.

However, I can move the mulx part to a follow-up PR to make review/testing easier.
The generated code without mulx is often smaller, and the additional register usage is not as big a deal for x64 as it is for 32-bit.

@Daniel-Svensson
Copy link
Contributor Author

Daniel-Svensson commented Jun 18, 2025

@jakobbotsch it seems you were not notified about this PR either (it was made before mulx for GT_MULHI, #116198, which you just reviewed). I've made some minor changes based on that feedback.

{
isRMW = false;

SingleTypeRegSet apxAwareRegCandidates =
Copy link
Contributor Author

@Daniel-Svensson Daniel-Svensson Jun 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: I have a separate commit that adds EDX as a fixed register for the return value instead of for the operand.

It seems to give a slightly better PerfScore for XxHashShared:MergeAccumulators, but it seems a bit backwards to specify the target register instead of the source. Also, the code difference might be fixed by future improvements to the register allocator instead.

How do you feel about it? I am not sure whether it is better or not.

generated assembly

With commit that fix return value in rdx:

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mov      rdx, r10
       mulx     rax, rdx, rax
       xor      rax, rdx
       add      rax, r8
       mov      rdx, qword ptr [rcx+0x10]
       mov      r8, qword ptr [rsp+0x10]
       xor      rdx, qword ptr [r8+0x10]
       mov      r10, qword ptr [rcx+0x18]
       xor      r10, qword ptr [r8+0x18]
       mulx     r10, rdx, r10
       xor      r10, rdx
       add      rax, r10
       mov      rdx, qword ptr [rcx+0x20]
       xor      rdx, qword ptr [r8+0x20]
       mov      r10, qword ptr [rcx+0x28]
       xor      r10, qword ptr [r8+0x28]
       mulx     r10, rdx, r10
       xor      r10, rdx
       add      rax, r10
       mov      rdx, qword ptr [rcx+0x30]
       xor      rdx, qword ptr [r8+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [r8+0x38]
       mulx     rcx, rdx, rcx
       xor      rcx, rdx
       add      rax, rcx
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=153 bbWeight=1 PerfScore 60.50

G_M1045_IG03:  ;; offset=0x0099
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 154

Without commit

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mov      rdx, r10
       mulx     r10, rax, rax
       xor      rax, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x10]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x10]
       mov      r10, qword ptr [rcx+0x18]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x18]
       mov      rdx, r10
       mulx     r10, r8, r8
       xor      r8, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x20]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x20]
       mov      r10, qword ptr [rcx+0x28]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x28]
       mov      rdx, r10
       mulx     r10, r8, r8
       xor      r8, r10
       add      rax, r8
       mov      r8, qword ptr [rcx+0x30]
       mov      rdx, qword ptr [rsp+0x10]
       xor      r8, qword ptr [rdx+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [rdx+0x38]
       mov      rdx, rcx
       mulx     rdx, rcx, r8
       xor      rcx, rdx
       add      rax, rcx
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=182 bbWeight=1 PerfScore 65.25

G_M1045_IG03:  ;; offset=0x00B6
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 183

Without AVX2 (mul only)

; Method BigMul.XxHashShared:MergeAccumulators(ptr,ptr,ulong):ulong (FullOpts)
G_M1045_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M1045_IG02:  ;; offset=0x0000
       mov      rax, qword ptr [rcx]
       xor      rax, qword ptr [rdx]
       mov      r10, qword ptr [rcx+0x08]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x08]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x10]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x10]
       mov      r10, qword ptr [rcx+0x18]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x18]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x20]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x20]
       mov      r10, qword ptr [rcx+0x28]
       mov      qword ptr [rsp+0x10], rdx
       xor      r10, qword ptr [rdx+0x28]
       mul      rdx:rax, r10
       xor      rax, rdx
       add      r8, rax
       mov      rax, qword ptr [rcx+0x30]
       mov      rdx, qword ptr [rsp+0x10]
       xor      rax, qword ptr [rdx+0x30]
       mov      rcx, qword ptr [rcx+0x38]
       xor      rcx, qword ptr [rdx+0x38]
       mul      rdx:rax, rcx
       xor      rax, rdx
       add      rax, r8
       mov      rcx, rax
       shr      rcx, 37
       xor      rcx, rax
       mov      rax, 0x165667919E3779F9
       imul     rax, rcx
       mov      rcx, rax
       shr      rcx, 32
       xor      rax, rcx
						;; size=162 bbWeight=1 PerfScore 64.25

G_M1045_IG03:  ;; offset=0x00A2
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 163

@jakobbotsch
Copy link
Member

cc @dotnet/jit-contrib

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Suboptimal x64 codegen for signed Math.BigMul
4 participants