Skip to content

Latest commit

 

History

History
140 lines (109 loc) · 3.98 KB

320.md

File metadata and controls

140 lines (109 loc) · 3.98 KB
Info

Example

#include <immintrin.h>

int main() {
    const std::vector a = {1, 2, 3, 4};
    const std::vector b = {5, 6, 7, 8};

    const auto va = _mm_loadu_si128((__m128i*)a.data());
    const auto vb = _mm_loadu_si128((__m128i*)b.data());
    const auto result = _mm_add_epi32(va, vb);

    std::vector<int> v(a.size());
    _mm_storeu_si128((__m128i*)v.data(), result);

    assert((std::vector{1 + 5, 2 + 6, 3 + 7, 4 + 8} == v));
}

https://godbolt.org/z/nz1hx76j7

Puzzle

Can you implement a function which computes the dot product of the two arrays using SIMD instructions?

[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs); // TODO

int main() {
    using namespace boost::ut;

    "simd.dot_product empty"_test = [] {
        const std::vector<float> a{};
        const std::vector<float> b{};

        expect(0_i == dot_product(a, b));
    };

    "simd.dot_product one"_test = [] {
        const std::vector a = {1.f};
        const std::vector b = {3.f};

        expect(_i(1*3) == dot_product(a, b));
    };

    "simd.dot_product many"_test = [] {
        const std::vector a = {1.f, 2.f, 3.f, 4.f};
        const std::vector b = {5.f, 6.f, 7.f, 8.f};

        expect(_i(1*5+2*6+3*7+4*8) == dot_product(a, b));
    };
}

https://godbolt.org/z/GnrEh1YM9

Solutions

namespace impl{
    template<typename T>
    struct simd{
        static constexpr auto dot_product(const T& lhs, const T& rhs) -> void
        {
            static_assert(false);
        }
    };
    template<>
    struct simd<std::vector<float>>{
        [[nodiscard]] static constexpr auto dot_product(const std::vector<float>& lhs, const std::vector<float>& rhs) -> float
        {
            std::array<float, 4> out = {0.0f, 0.0f, 0.0f, 0.0f};\
            auto vc = _mm_loadu_ps(out.data());
            for (auto i = 0uz; i < std::size(lhs); i += 4)
            {
                const auto va = _mm_loadu_ps(lhs.data() + i);
                const auto vb = _mm_loadu_ps(rhs.data() + i);
                vc = _mm_fmadd_ps(va, vb, vc);
            }
            _mm_store_ps(out.data(), vc);
            return out.at(0) + out.at(1) + out.at(2) + out.at(3);
        }
    };
}

[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs)
{
    return impl::simd<std::remove_cvref_t<decltype(lhs)>>::dot_product(lhs, rhs);
}
static inline float _mm_reduce_add_ps(__m128 x128) {
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    return _mm_cvtss_f32(x32);
}

[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs) {
    auto lhs_vector = _mm_load_ps(lhs.data());
    auto rhs_vector = _mm_load_ps(rhs.data());
    auto output = _mm_fmadd_ps(lhs_vector, rhs_vector, _mm_setzero_ps());
    return _mm_reduce_add_ps(output);
}

https://godbolt.org/z/G8Gjjqhbb

[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs) {
    const __m128 vlhs = _mm_load_ps(lhs.data());
    const __m128 vrhs = _mm_load_ps(rhs.data());
    const __m128 mul = _mm_mul_ps( vlhs, vrhs );
    const __m128 r2 = _mm_add_ps( mul, _mm_movehl_ps( mul, mul ) );
    const __m128 r1 = _mm_add_ss( r2, _mm_movehdup_ps( r2 ) );
    return _mm_cvtss_f32( r1 );
}

https://godbolt.org/z/1EPv1bdfe

[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs){
    const auto va = _mm256_load_ps(lhs.data());
    const auto vb = _mm256_load_ps(rhs.data());
    const auto result = _mm256_dp_ps(va,vb,0xFF);
    return (float)result[0];
}

https://godbolt.org/z/hhYeP4n5P