Skip to content

Commit

Permalink
[RFC] IR: Support atomicrmw FP ops with vector types (llvm#86796)
Browse files Browse the repository at this point in the history
Allow using atomicrmw fadd, fsub, fmin, and fmax with vectors of
floating-point type. AMDGPU supports atomic fadd for <2 x half> and <2 x
bfloat> on some targets and address spaces.

Note this only supports the proper floating-point operations; float
vector typed xchg is still not supported. cmpxchg still only supports
integers, so this inserts bitcasts for the loop expansion.

I have floating-point vector typed xchg, as well as vector-of-int and
vector-of-pointer operations, implemented separately, but I don't have an
immediate need for those beyond feature consistency.
  • Loading branch information
arsenm committed Apr 6, 2024
1 parent bd589f5 commit 4cb110a
Show file tree
Hide file tree
Showing 11 changed files with 1,510 additions and 11 deletions.
11 changes: 6 additions & 5 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11112,11 +11112,12 @@ For most of these operations, the type of '<value>' must be an integer
type whose bit width is a power of two greater than or equal to eight
and less than or equal to a target-specific size limit. For xchg, this
may also be a floating point or a pointer type with the same size constraints
as integers. For fadd/fsub/fmax/fmin, this must be a floating point type. The
type of the '``<pointer>``' operand must be a pointer to that type. If
the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not
allowed to modify the number or order of execution of this
``atomicrmw`` with other :ref:`volatile operations <volatile>`.
as integers. For fadd/fsub/fmax/fmin, this must be a floating-point
or fixed vector of floating-point type. The type of the '``<pointer>``'
operand must be a pointer to that type. If the ``atomicrmw`` is marked
as ``volatile``, then the optimizer is not allowed to modify the
number or order of execution of this ``atomicrmw`` with other
:ref:`volatile operations <volatile>`.

Note: if the alignment is not greater or equal to the size of the `<value>`
type, the atomic operation is likely to require a lock and have poor
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/AsmParser/LLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8240,6 +8240,8 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
return tokError("atomicrmw cannot be unordered");
if (!Ptr->getType()->isPointerTy())
return error(PtrLoc, "atomicrmw operand must be a pointer");
if (Val->getType()->isScalableTy())
return error(ValLoc, "atomicrmw operand may not be scalable");

if (Operation == AtomicRMWInst::Xchg) {
if (!Val->getType()->isIntegerTy() &&
Expand All @@ -8251,7 +8253,7 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
" operand must be an integer, floating point, or pointer type");
}
} else if (IsFP) {
if (!Val->getType()->isFloatingPointTy()) {
if (!Val->getType()->isFPOrFPVectorTy()) {
return error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be a floating point type");
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/CodeGen/AtomicExpandPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,9 +562,9 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Value *&Success, Value *&NewLoaded) {
Type *OrigTy = NewVal->getType();

// This code can go away when cmpxchg supports FP types.
// This code can go away when cmpxchg supports FP and vector types.
assert(!OrigTy->isPointerTy());
bool NeedBitcast = OrigTy->isFloatingPointTy();
bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
if (NeedBitcast) {
IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
NewVal = Builder.CreateBitCast(NewVal, IntTy);
Expand Down Expand Up @@ -731,7 +731,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
unsigned ValueSize = DL.getTypeStoreSize(ValueType);

PMV.ValueType = PMV.IntValueType = ValueType;
if (PMV.ValueType->isFloatingPointTy())
if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
PMV.IntValueType =
Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());

Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4268,9 +4268,10 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
" operand must have integer or floating point type!",
&RMWI, ElTy);
} else if (AtomicRMWInst::isFPOperation(Op)) {
Check(ElTy->isFloatingPointTy(),
Check(ElTy->isFPOrFPVectorTy() && !isa<ScalableVectorType>(ElTy),
"atomicrmw " + AtomicRMWInst::getOperationName(Op) +
" operand must have floating point type!",
" operand must have floating-point or fixed vector of floating-point "
"type!",
&RMWI, ElTy);
} else {
Check(ElTy->isIntegerTy(),
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/Assembler/atomic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,19 @@ define void @fp_atomics(ptr %x) {

ret void
}

; Round-trip test: the four proper FP atomicrmw operations (fadd, fsub,
; fmax, fmin) must now parse and print with a fixed-vector FP operand type.
define void @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
; CHECK: %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
%atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst

; CHECK: %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
%atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst

; CHECK: %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
%atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst

; CHECK: %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
%atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst

ret void
}
41 changes: 41 additions & 0 deletions llvm/test/Assembler/invalid-atomicrmw-scalable.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
; RUN: split-file %s %t --leading-lines
; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR0 %s
; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR1 %s
; RUN: not llvm-as < %t/scalable_ptr_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR2 %s
; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_fadd.ll 2>&1 | FileCheck -check-prefix=ERR3 %s
; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_add.ll 2>&1 | FileCheck -check-prefix=ERR4 %s

; Scalable vectors are rejected for every atomicrmw operation (the operand
; size must be known at compile time). Each split-file section checks one
; operation/element-type combination; the ERRn patterns match the column of
; the offending operand in the parser diagnostic.
;--- scalable_fp_vector_atomicrmw_xchg.ll
define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x half> %val) {
; ERR0: :41: error: atomicrmw operand may not be scalable
%atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x half> %val seq_cst
ret <vscale x 2 x half> %atomic.xchg
}

; Integer element type: still rejected when the vector is scalable.
;--- scalable_int_vector_atomicrmw_xchg.ll
define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x i16> %val) {
; ERR1: :41: error: atomicrmw operand may not be scalable
%atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x i16> %val seq_cst
ret <vscale x 2 x i16> %atomic.xchg
}

; Pointer element type: still rejected when the vector is scalable.
;--- scalable_ptr_vector_atomicrmw_xchg.ll
define <vscale x 2 x ptr> @scalable_ptr_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x ptr> %val) {
; ERR2: :41: error: atomicrmw operand may not be scalable
%atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x ptr> %val seq_cst
ret <vscale x 2 x ptr> %atomic.xchg
}

; FP arithmetic op (fadd): fixed FP vectors are now allowed, scalable are not.
;--- scalable_fp_vector_atomicrmw_fadd.ll
define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_fadd(ptr %x, <vscale x 2 x half> %val) {
; ERR3: :41: error: atomicrmw operand may not be scalable
%atomic.fadd = atomicrmw fadd ptr %x, <vscale x 2 x half> %val seq_cst
ret <vscale x 2 x half> %atomic.fadd
}

; Integer arithmetic op (add): also rejected for scalable vectors.
;--- scalable_int_vector_atomicrmw_add.ll
define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_add(ptr %x, <vscale x 2 x i16> %val) {
; ERR4: :39: error: atomicrmw operand may not be scalable
%atomic.add = atomicrmw add ptr %x, <vscale x 2 x i16> %val seq_cst
ret <vscale x 2 x i16> %atomic.add
}
7 changes: 7 additions & 0 deletions llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s

; Negative test: this change adds FP vectors only for the proper FP
; operations (fadd/fsub/fmax/fmin); xchg with an FP vector stays invalid.
; CHECK: error: atomicrmw xchg operand must be an integer, floating point, or pointer type
define <2 x half> @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
%atomic.xchg = atomicrmw xchg ptr %x, <2 x half> %val seq_cst
ret <2 x half> %atomic.xchg
}
115 changes: 115 additions & 0 deletions llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s
; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s

; <2 x half> fadd expands to a cmpxchg loop: the value is bitcast to i32 for
; the compare-exchange (LL/SC pair without LSE, casal with LSE) while the FP
; add itself is done on the vector after widening half -> float.
define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
; NOLSE: // %bb.0:
; NOLSE-NEXT: fcvtl v1.4s, v0.4h
; NOLSE-NEXT: ldr s0, [x0]
; NOLSE-NEXT: b .LBB0_2
; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
; NOLSE-NEXT: fmov s0, w10
; NOLSE-NEXT: cmp w10, w9
; NOLSE-NEXT: b.eq .LBB0_5
; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
; NOLSE-NEXT: fcvtl v2.4s, v0.4h
; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
; NOLSE-NEXT: fcvtn v2.4h, v2.4s
; NOLSE-NEXT: fmov w8, s2
; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxr w10, [x0]
; NOLSE-NEXT: cmp w10, w9
; NOLSE-NEXT: b.ne .LBB0_1
; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
; NOLSE-NEXT: stlxr wzr, w8, [x0]
; NOLSE-NEXT: cbnz wzr, .LBB0_3
; NOLSE-NEXT: b .LBB0_1
; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
; LSE: // %bb.0:
; LSE-NEXT: fcvtl v1.4s, v0.4h
; LSE-NEXT: ldr s0, [x0]
; LSE-NEXT: .LBB0_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: fcvtl v2.4s, v0.4h
; LSE-NEXT: fmov w8, s0
; LSE-NEXT: mov w10, w8
; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s
; LSE-NEXT: fcvtn v2.4h, v2.4s
; LSE-NEXT: fmov w9, s2
; LSE-NEXT: casal w10, w9, [x0]
; LSE-NEXT: fmov s0, w10
; LSE-NEXT: cmp w10, w8
; LSE-NEXT: b.ne .LBB0_1
; LSE-NEXT: // %bb.2: // %atomicrmw.end
; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0
; LSE-NEXT: ret
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
ret <2 x half> %res
}

; <2 x float> fadd: 64-bit payload, so the cmpxchg loop bitcasts the vector
; to i64 (64-bit LL/SC without LSE, 64-bit casal with LSE); the add is a
; native v2f32 fadd with no element widening needed.
define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
; NOLSE: // %bb.0:
; NOLSE-NEXT: ldr d1, [x0]
; NOLSE-NEXT: b .LBB1_2
; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
; NOLSE-NEXT: fmov d1, x10
; NOLSE-NEXT: cmp x10, x9
; NOLSE-NEXT: b.eq .LBB1_5
; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s
; NOLSE-NEXT: fmov x9, d1
; NOLSE-NEXT: fmov x8, d2
; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxr x10, [x0]
; NOLSE-NEXT: cmp x10, x9
; NOLSE-NEXT: b.ne .LBB1_1
; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
; NOLSE-NEXT: stlxr wzr, x8, [x0]
; NOLSE-NEXT: cbnz wzr, .LBB1_3
; NOLSE-NEXT: b .LBB1_1
; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
; NOLSE-NEXT: fmov d0, d1
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
; LSE: // %bb.0:
; LSE-NEXT: ldr d1, [x0]
; LSE-NEXT: .LBB1_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s
; LSE-NEXT: fmov x8, d1
; LSE-NEXT: mov x10, x8
; LSE-NEXT: fmov x9, d2
; LSE-NEXT: casal x10, x9, [x0]
; LSE-NEXT: fmov d1, x10
; LSE-NEXT: cmp x10, x8
; LSE-NEXT: b.ne .LBB1_1
; LSE-NEXT: // %bb.2: // %atomicrmw.end
; LSE-NEXT: fmov d0, d1
; LSE-NEXT: ret
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
ret <2 x float> %res
}

attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
84 changes: 84 additions & 0 deletions llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck %s

; X86 <2 x half> fadd: no native half arithmetic, so each element goes
; through __extendhfsf2/__truncsfhf2 libcalls; the two results are packed
; into a 32-bit value and committed with lock cmpxchgl.
define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; CHECK-LABEL: test_atomicrmw_fadd_v2f16_align4:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $88, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1
; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movzwl %ax, %ebp
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: pextrw $0, %xmm0, %ecx
; CHECK-NEXT: shll $16, %ecx
; CHECK-NEXT: orl %ebp, %ecx
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pextrw $0, %xmm0, %edx
; CHECK-NEXT: shll $16, %edx
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
; CHECK-NEXT: setne %cl
; CHECK-NEXT: pinsrw $0, %eax, %xmm0
; CHECK-NEXT: shrl $16, %eax
; CHECK-NEXT: pinsrw $0, %eax, %xmm1
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %atomicrmw.end
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: addq $88, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
ret <2 x half> %res
}

; X86 <2 x float> fadd: the vector is added natively with addps, then the
; 64-bit payload is moved to a GPR and committed with lock cmpxchgq.
define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
; CHECK-LABEL: test_atomicrmw_fadd_v2f32_align8:
; CHECK: # %bb.0:
; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: addps %xmm0, %xmm1
; CHECK-NEXT: movq %xmm1, %rcx
; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi)
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: jne .LBB1_1
; CHECK-NEXT: # %bb.2: # %atomicrmw.end
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
ret <2 x float> %res
}

attributes #0 = { nounwind }
Loading

0 comments on commit 4cb110a

Please sign in to comment.