From 0da0e21c1d87d4165f3b0cb298c055c65c78f7f6 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 29 May 2024 18:09:23 +0800 Subject: [PATCH] [Reassociate] Drop weight reduction to fix issue 91417 (#91469) See the following case: https://alive2.llvm.org/ce/z/A-fBki ``` define i3 @src(i3 %0) { %2 = mul i3 %0, %0 %3 = mul i3 %2, %0 %4 = mul i3 %3, %0 %5 = mul nsw i3 %4, %0 ret i3 %5 } define i3 @tgt(i3 %0) { %2 = mul i3 %0, %0 %5 = mul nsw i3 %2, %0 ret i3 %5 } ``` https://github.com/llvm/llvm-project/commit/d7aeefebd6b049f017711cd7c6ef5f217a17b673 introduced weight reduction during weights combination of the same operand. As the weight of `%0` changes from 5 to 3, the nsw flag in `%5` should be dropped. However, the nsw flag isn't cleared by `RewriteExprTree` since `%5 = mul nsw i3 %0, %4` is not included in the range of `[ExpressionChangedStart, ExpressionChangedEnd)`. ``` Calculated Rank[] = 3 Combine negations for: %2 = mul i3 %0, %0 Calculated Rank[] = 4 Combine negations for: %3 = mul i3 %0, %2 Calculated Rank[] = 5 Combine negations for: %4 = mul i3 %0, %3 Calculated Rank[] = 6 Combine negations for: %5 = mul nsw i3 %0, %4 LINEARIZE: %5 = mul nsw i3 %0, %4 OPERAND: i3 %0 (1) ADD USES LEAF: i3 %0 (1) OPERAND: %4 = mul i3 %0, %3 (1) DIRECT ADD: %4 = mul i3 %0, %3 (1) OPERAND: i3 %0 (1) OPERAND: %3 = mul i3 %0, %2 (1) DIRECT ADD: %3 = mul i3 %0, %2 (1) OPERAND: i3 %0 (1) OPERAND: %2 = mul i3 %0, %0 (1) DIRECT ADD: %2 = mul i3 %0, %0 (1) OPERAND: i3 %0 (1) OPERAND: i3 %0 (1) RAIn: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RAOut: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RAOut after CSE reorder: mul i3 [ %0, #3] [ %0, #3] [ %0, #3] RA: %5 = mul nsw i3 %0, %4 TO: %5 = mul nsw i3 %4, %0 RA: %4 = mul i3 %0, %3 TO: %4 = mul i3 %0, %0 ``` The best way to fix this is to inform `RewriteExprTree` to clear flags of the whole expr tree when weight reduction happens. But I find that weight reduction based on Carmichael number never happens in practice. See the coverage result https://dtcxzyw.github.io/llvm-opt-benchmark/coverage/home/dtcxzyw/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp.html#L323 I think it would be better to drop `IncorporateWeight`. Fixes #91417 --- llvm/lib/Transforms/Scalar/Reassociate.cpp | 112 +----------- llvm/test/Transforms/Reassociate/repeats.ll | 187 +++++++++++++------- 2 files changed, 126 insertions(+), 173 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index c903e47a93caf..04c54ed69e93f 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -302,97 +302,6 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { return Res; } -/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael -/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for -/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. -/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every -/// even x in Bitwidth-bit arithmetic. -static unsigned CarmichaelShift(unsigned Bitwidth) { - if (Bitwidth < 3) - return Bitwidth - 1; - return Bitwidth - 2; -} - -/// Add the extra weight 'RHS' to the existing weight 'LHS', -/// reducing the combined weight using any special properties of the operation. -/// The existing weight LHS represents the computation X op X op ... op X where -/// X occurs LHS times. The combined weight represents X op X op ... op X with -/// X occurring LHS + RHS times. If op is "Xor" for example then the combined -/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; -/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. -static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { - // If we were working with infinite precision arithmetic then the combined - // weight would be LHS + RHS. But we are using finite precision arithmetic, - // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct - // for nilpotent operations and addition, but not for idempotent operations - // and multiplication), so it is important to correctly reduce the combined - // weight back into range if wrapping would be wrong. - - // If RHS is zero then the weight didn't change. - if (RHS.isMinValue()) - return; - // If LHS is zero then the combined weight is RHS. - if (LHS.isMinValue()) { - LHS = RHS; - return; - } - // From this point on we know that neither LHS nor RHS is zero. - - if (Instruction::isIdempotent(Opcode)) { - // Idempotent means X op X === X, so any non-zero weight is equivalent to a - // weight of 1. Keeping weights at zero or one also means that wrapping is - // not a problem. - assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - return; // Return a weight of 1. - } - if (Instruction::isNilpotent(Opcode)) { - // Nilpotent means X op X === 0, so reduce weights modulo 2. - assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); - LHS = 0; // 1 + 1 === 0 modulo 2. - return; - } - if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) { - // TODO: Reduce the weight by exploiting nsw/nuw? - LHS += RHS; - return; - } - - assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) && - "Unknown associative operation!"); - unsigned Bitwidth = LHS.getBitWidth(); - // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth - // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth - // bit number x, since either x is odd in which case x^CM = 1, or x is even in - // which case both x^W and x^(W - CM) are zero. By subtracting off multiples - // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) - // which by a happy accident means that they can always be represented using - // Bitwidth bits. - // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than - // the Carmichael number). - if (Bitwidth > 3) { - /// CM - The value of Carmichael's lambda function. - APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); - // Any weight W >= Threshold can be replaced with W - CM. - APInt Threshold = CM + Bitwidth; - assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); - // For Bitwidth 4 or more the following sum does not overflow. - LHS += RHS; - while (LHS.uge(Threshold)) - LHS -= CM; - } else { - // To avoid problems with overflow do everything the same as above but using - // a larger type. - unsigned CM = 1U << CarmichaelShift(Bitwidth); - unsigned Threshold = CM + Bitwidth; - assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && - "Weights not reduced!"); - unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); - while (Total >= Threshold) - Total -= CM; - LHS = Total; - } -} - using RepeatedValue = std::pair; /// Given an associative binary expression, return the leaf @@ -562,26 +471,7 @@ static bool LinearizeExprTree(Instruction *I, "In leaf map but not visited!"); // Update the number of paths to the leaf. - IncorporateWeight(It->second, Weight, Opcode); - -#if 0 // TODO: Re-enable once PR13021 is fixed. - // The leaf already has one use from inside the expression. As we want - // exactly one such use, drop this new use of the leaf. - assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); - I->setOperand(OpIdx, UndefValue::get(I->getType())); - Changed = true; - - // If the leaf is a binary operation of the right kind and we now see - // that its multiple original uses were in fact all by nodes belonging - // to the expression, then no longer consider it to be a leaf and add - // its operands to the expression. - if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { - LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); - Worklist.push_back(std::make_pair(BO, It->second)); - Leaves.erase(It); - continue; - } -#endif + It->second += Weight; // If we still have uses that are not accounted for by the expression // then it is not safe to modify the value. diff --git a/llvm/test/Transforms/Reassociate/repeats.ll b/llvm/test/Transforms/Reassociate/repeats.ll index c18db19fa73e3..28177f1c0ba5e 100644 --- a/llvm/test/Transforms/Reassociate/repeats.ll +++ b/llvm/test/Transforms/Reassociate/repeats.ll @@ -1,56 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=reassociate -S | FileCheck %s ; Tests involving repeated operations on the same value. define i8 @nilpotent(i8 %x) { -; CHECK-LABEL: @nilpotent( +; CHECK-LABEL: define i8 @nilpotent( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i8 0 +; %tmp = xor i8 %x, %x ret i8 %tmp -; CHECK: ret i8 0 } define i2 @idempotent(i2 %x) { -; CHECK-LABEL: @idempotent( +; CHECK-LABEL: define i2 @idempotent( +; CHECK-SAME: i2 [[X:%.*]]) { +; CHECK-NEXT: ret i2 -1 +; %tmp1 = and i2 %x, %x %tmp2 = and i2 %tmp1, %x %tmp3 = and i2 %tmp2, %x ret i2 %tmp3 -; CHECK: ret i2 %x } define i2 @add(i2 %x) { -; CHECK-LABEL: @add( +; CHECK-LABEL: define i2 @add( +; CHECK-SAME: i2 [[X:%.*]]) { +; CHECK-NEXT: ret i2 0 +; %tmp1 = add i2 %x, %x %tmp2 = add i2 %tmp1, %x %tmp3 = add i2 %tmp2, %x ret i2 %tmp3 -; CHECK: ret i2 0 } define i2 @cst_add() { -; CHECK-LABEL: @cst_add( +; CHECK-LABEL: define i2 @cst_add() { +; CHECK-NEXT: ret i2 -1 +; %tmp1 = add i2 1, 1 %tmp2 = add i2 %tmp1, 1 ret i2 %tmp2 -; CHECK: ret i2 -1 } define i8 @cst_mul() { -; CHECK-LABEL: @cst_mul( +; CHECK-LABEL: define i8 @cst_mul() { +; CHECK-NEXT: ret i8 -13 +; %tmp1 = mul i8 3, 3 %tmp2 = mul i8 %tmp1, 3 %tmp3 = mul i8 %tmp2, 3 %tmp4 = mul i8 %tmp3, 3 ret i8 %tmp4 -; CHECK: ret i8 -13 } define i3 @foo3x5(i3 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo3x5( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x5( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP5]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -58,12 +70,31 @@ define i3 @foo3x5(i3 %x) { ret i3 %tmp4 } +define i3 @foo3x5_nsw(i3 %x) { +; Can be done with two multiplies. +; CHECK-LABEL: define i3 @foo3x5_nsw( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP4]] +; + %tmp1 = mul i3 %x, %x + %tmp2 = mul i3 %tmp1, %x + %tmp3 = mul i3 %tmp2, %x + %tmp4 = mul nsw i3 %tmp3, %x + ret i3 %tmp4 +} + define i3 @foo3x6(i3 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo3x6( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x6( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i3 [[TMP2]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -74,10 +105,14 @@ define i3 @foo3x6(i3 %x) { define i3 @foo3x7(i3 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo3x7( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i3 @foo3x7( +; CHECK-SAME: i3 [[X:%.*]]) { +; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[X]], [[X]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i3 [[TMP5]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP7]], [[X]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i3 [[TMP3]], [[TMP7]] +; CHECK-NEXT: ret i3 [[TMP6]] +; %tmp1 = mul i3 %x, %x %tmp2 = mul i3 %tmp1, %x %tmp3 = mul i3 %tmp2, %x @@ -89,10 +124,13 @@ define i3 @foo3x7(i3 %x) { define i4 @foo4x8(i4 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo4x8( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x8( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP4]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -105,11 +143,14 @@ define i4 @foo4x8(i4 %x) { define i4 @foo4x9(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x9( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x9( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i4 [[TMP3]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP8]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -123,11 +164,14 @@ define i4 @foo4x9(i4 %x) { define i4 @foo4x10(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x10( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x10( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP3]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -142,12 +186,15 @@ define i4 @foo4x10(i4 %x) { define i4 @foo4x11(i4 %x) { ; Can be done with four multiplies. -; CHECK-LABEL: @foo4x11( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x11( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i4 [[TMP3]], [[TMP2]] +; CHECK-NEXT: ret i4 [[TMP10]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -163,10 +210,14 @@ define i4 @foo4x11(i4 %x) { define i4 @foo4x12(i4 %x) { ; Can be done with two multiplies. -; CHECK-LABEL: @foo4x12( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x12( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP3]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP2]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -183,11 +234,15 @@ define i4 @foo4x12(i4 %x) { define i4 @foo4x13(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x13( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x13( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i4 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i4 [[TMP12]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -205,11 +260,15 @@ define i4 @foo4x13(i4 %x) { define i4 @foo4x14(i4 %x) { ; Can be done with three multiplies. -; CHECK-LABEL: @foo4x14( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x14( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP5]], [[X]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i4 [[TMP6]], [[TMP6]] +; CHECK-NEXT: ret i4 [[TMP7]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x @@ -228,12 +287,16 @@ define i4 @foo4x14(i4 %x) { define i4 @foo4x15(i4 %x) { ; Can be done with four multiplies. -; CHECK-LABEL: @foo4x15( -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: mul -; CHECK-NEXT: ret +; CHECK-LABEL: define i4 @foo4x15( +; CHECK-SAME: i4 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP6]], [[X]] +; CHECK-NEXT: [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret i4 [[TMP14]] +; %tmp1 = mul i4 %x, %x %tmp2 = mul i4 %tmp1, %x %tmp3 = mul i4 %tmp2, %x