@@ -304,6 +304,10 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
304304 { ISD::SRA, MVT::v2i64, 1 },
305305 { ISD::SRA, MVT::v4i64, 1 },
306306 { ISD::SRA, MVT::v8i64, 1 },
307+
308+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
309+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
310+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
307311 };
308312
309313 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -370,6 +374,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
370374 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
371375 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
372376 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
377+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
378+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
379+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
380+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
381+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
382+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
383+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
384+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
373385 };
374386
375387 if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
@@ -446,11 +458,32 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
446458 return LT.first * Entry->Cost ;
447459 }
448460
461+ static const CostTblEntry AVX512BWShiftCostTable[] = { // i16 vector shifts costed at 1 each; consulted only when ST->hasBWI().
462+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
463+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
464+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
465+
466+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
467+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
468+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
469+
470+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
471+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
472+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
473+ };
474+
475+ if (ST->hasBWI ()) // This lookup precedes the AVX2UniformCostTable below, so a match here returns first.
476+ if (const auto *Entry = CostTableLookup (AVX512BWShiftCostTable, ISD, LT.second ))
477+ return LT.first * Entry->Cost ;
478+
449479 static const CostTblEntry AVX2UniformCostTable[] = {
450480 // Uniform splats are cheaper for the following instructions; the v32i16 rows are costed as two of the v16i16 ops.
451481 { ISD::SHL, MVT::v16i16, 1 }, // psllw.
452482 { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
453483 { ISD::SRA, MVT::v16i16, 1 }, // psraw.
484+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
485+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
486+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
454487 };
455488
456489 if (ST->hasAVX2 () &&
@@ -495,18 +528,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
495528 return LT.first * Entry->Cost ;
496529
497530 static const CostTblEntry AVX512BWCostTable[] = {
498- { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
499- { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
500- { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
501-
502- { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
503- { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
504- { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
505-
506- { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
507- { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
508- { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
509-
510531 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
511532 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
512533 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
@@ -533,6 +554,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
533554 { ISD::SRA, MVT::v4i64, 1 },
534555 { ISD::SRA, MVT::v8i64, 1 },
535556
557+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
536558 { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
537559 { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
538560 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
@@ -568,6 +590,17 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
568590 { ISD::SRL, MVT::v4i64, 1 },
569591 };
570592
593+ if (ST->hasAVX512 ()) {
594+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
595+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
596+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
597+ // On AVX512, a packed v32i16 shift left by a constant build_vector (uniform or non-uniform)
598+ // is lowered into a vector multiply (vpmullw), so it is costed as a Mul with Op1Info/Op2Info forwarded.
599+ return getArithmeticInstrCost (Instruction::Mul, Ty, Op1Info, Op2Info,
600+ TargetTransformInfo::OP_None,
601+ TargetTransformInfo::OP_None);
602+ }
603+
571604 // Look for AVX2 lowering tricks.
572605 if (ST->hasAVX2 ()) {
573606 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -667,13 +700,19 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
667700
668701 static const CostTblEntry AVX2CostTable[] = {
669702 { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
703+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
670704 { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsllvd/pack sequence.
705+ { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsllvd/pack sequence.
671706
672707 { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
708+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
673709 { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
710+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
674711
675712 { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
713+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
676714 { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
715+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
677716 { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
678717 { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
679718
@@ -1070,6 +1109,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
10701109 {TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
10711110 {TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
10721111 {TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
1112+ {TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
1113+ {TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
10731114
10741115 {TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
10751116 {TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
@@ -1101,7 +1142,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
11011142 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
11021143 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
11031144 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
1104- {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
1145+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 }, // vpermt2d
1146+
1147+ // FIXME: This just applies the type legalization cost rules above
1148+ // assuming these completely split.
1149+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14 },
1150+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14 },
1151+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42 },
1152+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42 },
11051153 };
11061154
11071155 if (ST->hasAVX512 ())
@@ -1358,6 +1406,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
13581406 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
13591407 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7 },// 2*vpmovqd+concat+vpmovdb
13601408
1409+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 9 }, // FIXME
1410+
13611411 // v16i1 -> v16i32 - load + broadcast
13621412 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
13631413 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
@@ -1372,6 +1422,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
13721422 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
13731423 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
13741424
1425+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1426+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
1427+
13751428 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
13761429 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
13771430 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
@@ -1843,6 +1896,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
18431896 { ISD::SELECT, MVT::v16i32, 1 },
18441897 { ISD::SELECT, MVT::v8f64, 1 },
18451898 { ISD::SELECT, MVT::v16f32, 1 },
1899+
1900+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
1901+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
1902+
1903+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
1904+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
18461905 };
18471906
18481907 static const CostTblEntry AVX2CostTbl[] = {
@@ -2005,12 +2064,20 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
20052064 static const CostTblEntry AVX512CostTbl[] = {
20062065 { ISD::BITREVERSE, MVT::v8i64, 36 },
20072066 { ISD::BITREVERSE, MVT::v16i32, 24 },
2067+ { ISD::BITREVERSE, MVT::v32i16, 10 },
2068+ { ISD::BITREVERSE, MVT::v64i8, 10 },
20082069 { ISD::CTLZ, MVT::v8i64, 29 },
20092070 { ISD::CTLZ, MVT::v16i32, 35 },
2071+ { ISD::CTLZ, MVT::v32i16, 28 },
2072+ { ISD::CTLZ, MVT::v64i8, 18 },
20102073 { ISD::CTPOP, MVT::v8i64, 16 },
20112074 { ISD::CTPOP, MVT::v16i32, 24 },
2075+ { ISD::CTPOP, MVT::v32i16, 18 },
2076+ { ISD::CTPOP, MVT::v64i8, 12 },
20122077 { ISD::CTTZ, MVT::v8i64, 20 },
20132078 { ISD::CTTZ, MVT::v16i32, 28 },
2079+ { ISD::CTTZ, MVT::v32i16, 24 },
2080+ { ISD::CTTZ, MVT::v64i8, 18 },
20142081 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
20152082 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
20162083 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -2019,6 +2086,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
20192086 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
20202087 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
20212088 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2089+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2090+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2091+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2092+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
2093+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
2094+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
2095+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
2096+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
20222097 { ISD::FMAXNUM, MVT::f32 , 2 },
20232098 { ISD::FMAXNUM, MVT::v4f32, 2 },
20242099 { ISD::FMAXNUM, MVT::v8f32, 2 },
0 commit comments