How to generate more efficient SIMD code #2632

zengdelang · 2023-08-22T14:16:17Z

zengdelang
Aug 22, 2023

The following code simulates _mm256_movemask_ps. Although it runs correctly, its execution overhead is higher than that of _mm256_movemask_ps. Is there a more efficient way to write the code so that it matches the performance of _mm256_movemask_ps? For specific timing comparisons, please check the repository: https://github.com/zengdelang/ISPC_Sort

// Processes one 8-element vector for a pivot-based partition step.
//
// Parameters (each is indexed 0..7 below):
//   CurVec      - current elements; read by the min/max/compare steps, then passed
//                 as both the first and second argument of
//                 Shuffle8_One_Vector_To_Vector_.
//   PivotVec    - values that each CurVec[i] is compared against.
//   SmallestVec - updated in place to min(SmallestVec[i], CurVec[i]).
//   BiggestVec  - updated in place to max(BiggestVec[i], CurVec[i]).
//
// Returns the number of indices i for which CurVec[i] > PivotVec[i].
//
// NOTE(review): Sort8Type, Shuffle8_One_Vector_To_Vector_ and Permutation_Masks are
// declared outside this snippet. Presumably the shuffle rearranges CurVec so that the
// elements on each side of the pivot are grouped together -- confirm against their
// definitions.
inline uniform int Partition_Vec(uniform Sort8Type CurVec[], uniform Sort8Type PivotVec[], uniform Sort8Type SmallestVec[], uniform Sort8Type BiggestVec[])
{
    /* update the smallest and largest values of the array */
    foreach (i = 0 ... 8)
    {
        SmallestVec[i] = min(SmallestVec[i], CurVec[i]);
    }

    foreach (i = 0 ... 8)
    {
        BiggestVec[i] = max(BiggestVec[i], CurVec[i]);
    }

    /* which elements are larger than the pivot */
    // __m256i compared = _mm256_cmpgt_epi32(Curr_Vec, pivot_vec);
    // sign_extend() widens the boolean compare result to an integer, mirroring the
    // all-ones / all-zeros element that the AVX2 compare above produces.
    uniform uint32 Compared[8];
    foreach (i = 0 ... 8)
    {
        Compared[i] = sign_extend(CurVec[i] > PivotVec[i]);
    }

    /* extract the most significant bit from each integer of the vector */
    // int mm = _mm256_movemask_ps(_mm256_castsi256_ps(compared));
    // ComparedMask[i] == (1 << i), so after the AND element i keeps only bit i,
    // and only when its comparison was true.
    uniform uint32 ComparedMask[8] = {0b1, 0b10, 0b100, 0b1000, 0b10000, 0b100000, 0b1000000, 0b10000000};
    foreach (i = 0 ... 8)
    {
        Compared[i] = Compared[i] & ComparedMask[i];
    }

    // Sum is declared without `uniform`, so it is a varying (per-lane) accumulator.
    // Each element contributes a distinct bit, so adding them is equivalent to OR-ing
    // them; reduce_add() then folds the lanes into the single 8-bit mask.
    int32 Sum = 0;
    foreach (i = 0 ... 8)
    {
        Sum += Compared[i];
    }
    uniform int32 MoveMask = reduce_add(Sum);

    /* how many ones, each 1 stands for an element greater than pivot */
    // int Amount_GT_Pivot = _mm_popcnt_u32((mm));
    uniform int32 Amount_GT_Pivot = popcnt(MoveMask);

    // MoveMask (0..255) selects the entry of Permutation_Masks handed to the shuffle;
    // CurVec is passed as both the first and second argument.
    Shuffle8_One_Vector_To_Vector_(CurVec, CurVec, Permutation_Masks[MoveMask]);

    return Amount_GT_Pivot;
}

Answered by nurmukhametov

Aug 22, 2023

/* which elements are larger than the pivot */
// __m256i compared = _mm256_cmpgt_epi32(Curr_Vec, pivot_vec);
uniform uint32 Compared[8];
...
/* extract the most significant bit from each integer of the vector */
// int mm = _mm256_movemask_ps(_mm256_castsi256_ps(compared));
uniform uint32 ComparedMask[8] = {0b1, 0b10, 0b100, 0b1000, 0b10000, 0b100000, 0b1000000, 0b10000000};
....
}

I am not sure I follow the details, but the quoted part looks like it can be expressed using packmask, like this:

// Builds a bitmask of the lanes whose element of a is greater than the matching
// element of p, using one element per program instance.
export uniform int foo(uniform uint8 a[], uniform uint8 p[]) {
    // Per-lane comparison result (varying).
    bool isGreater = a[programIndex] > p[programIndex];
    // Collapse the per-lane booleans into one uniform bitmask.
    return packmask(isGreater);
}

compiling with the command:

$ ispc -O2 --target=avx2-i8x32 example.ispc …

View full answer

zengdelang · 2023-08-22T14:33:46Z

zengdelang
Aug 22, 2023
Author

Another question: I need to simulate the _mm512_mask_compressstoreu_pd instruction on the AVX-512 instruction set to implement the partition function of quicksort. Can the following simulation code be compiled into the _mm512_mask_compressstoreu_pd instruction?

// Emulates an AVX-512 compress-store (cf. _mm512_mask_compressstoreu_pd): the
// elements of Values whose Mask entry is positive are written contiguously to the
// front of Indices.
//
// NOTE(review): the buffers hold 16 32-bit ints, whereas the _pd intrinsic operates
// on 8 doubles; the 32-bit integer counterpart is _mm512_mask_compressstoreu_epi32.
uniform void Simulate_Compressstoreu_AVX512()
{
    uniform int Indices[16];
    uniform int Mask[16] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
    uniform int Values[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};

    // Number of elements written so far. packed_store_active() returns how many
    // values it stored; advancing the destination by that amount keeps the output
    // contiguous when the foreach needs more than one pass (gang width < 16).
    // Always writing to &Indices[0] would make each pass overwrite the previous one.
    uniform int NumStored = 0;

    foreach (i = 0 ... 16)
    {
        if (Mask[i] > 0)
        {
            NumStored += packed_store_active(&Indices[NumStored], Values[i]);
        }
    }
}

0 replies

nurmukhametov · 2023-08-22T15:21:11Z

nurmukhametov
Aug 22, 2023
Maintainer

/* which elements are larger than the pivot */
// __m256i compared = _mm256_cmpgt_epi32(Curr_Vec, pivot_vec);
uniform uint32 Compared[8];
...
/* extract the most significant bit from each integer of the vector */
// int mm = _mm256_movemask_ps(_mm256_castsi256_ps(compared));
uniform uint32 ComparedMask[8] = {0b1, 0b10, 0b100, 0b1000, 0b10000, 0b100000, 0b1000000, 0b10000000};
....
}

I am not sure I follow the details, but the quoted part looks like it can be expressed using packmask, like this:

// Builds a bitmask of the lanes whose element of a is greater than the matching
// element of p, using one element per program instance.
export uniform int foo(uniform uint8 a[], uniform uint8 p[]) {
    // Per-lane comparison result (varying).
    bool isGreater = a[programIndex] > p[programIndex];
    // Collapse the per-lane booleans into one uniform bitmask.
    return packmask(isGreater);
}

compiling with the command:

$ ispc -O2 --target=avx2-i8x32 example.ispc -o example.o

the following code is generated:

foo:                                    # @foo
        vmovdqu ymm0, ymmword ptr [rdi]
        vpminub ymm1, ymm0, ymmword ptr [rsi]
        vpcmpeqb        ymm0, ymm0, ymm1
        vpmovmskb       eax, ymm0
        not     eax
        vzeroupper
        ret

1 reply

zengdelang Aug 22, 2023
Author

@nurmukhametov It works. Thanks a lot

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to generate more efficient SIMD code #2632

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 2 comments 1 reply

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

Select a reply

How to generate more efficient SIMD code #2632

zengdelang Aug 22, 2023

Replies: 2 comments · 1 reply

zengdelang Aug 22, 2023 Author

nurmukhametov Aug 22, 2023 Maintainer

zengdelang Aug 22, 2023 Author

zengdelang
Aug 22, 2023

Replies: 2 comments 1 reply

zengdelang
Aug 22, 2023
Author

nurmukhametov
Aug 22, 2023
Maintainer

zengdelang Aug 22, 2023
Author