Merge pull request #920 from xtensor-stack/test/doc
Setup testing of documentation example
JohanMabille committed Apr 19, 2023
2 parents 24121b5 + 99096af commit e3c6a7e
Showing 17 changed files with 223 additions and 195 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ jobs:
mkdir _build
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
- name: Build
run: cmake --build _build
run: cmake --build _build --verbose
- name: Testing sequential
run: cmake --build _build --target xbenchmark
run: cmake --build _build --target xbenchmark --verbose
- name: Testing xsimd
run: ${{github.workspace}}/_build/test/test_xsimd
43 changes: 3 additions & 40 deletions docs/source/api/dispatching.rst
Expand Up @@ -58,49 +58,12 @@ specific details follow.
The ``sum.hpp`` header contains the function actually being called, in an
architecture-agnostic form:

.. code-block:: c++
.. literalinclude:: ../../../test/doc/sum.hpp

#ifndef _SUM_HPP
#define _SUM_HPP

// functor with a call method that depends on `Arch`
struct sum {
// It's critical not to use an in-class definition here.
// In-class and inline definition bypass extern template mechanism.
template<class Arch, class T>
T operator()(Arch, T const* data, unsigned size);
};

template<class Arch, class T>
T sum::operator()(Arch, T const* data, unsigned size)
{
using batch = xsimd::batch<T, Arch>;
batch acc(static_cast<T>(0));
const unsigned n = size / batch::size * batch::size;
for(unsigned i = 0; i != n; i += batch::size)
acc += batch::load_unaligned(data + i);
T star_acc = xsimd::reduce_add(acc);
for(unsigned i = n; i < size; ++i)
star_acc += data[i];
return star_acc;
}

// Inform the compiler that sse2 and avx2 implementation are to be found in another compilaton unit.
extern template float sum::operator()<xsimd::avx2, float>(xsimd::avx2, float const*, unsigned);
extern template float sum::operator()<xsimd::sse2, float>(xsimd::sse2, float const*, unsigned);
#endif

The SSE2 and AVX2 versions need to be provided in other compilation units, compiled with the appropriate flags, for instance:

.. code-block:: c++

// compile with -mavx2
#include "sum.hpp"
template float sum::operator()<xsimd::avx2, float>(xsimd::avx2, float const*, unsigned);
.. literalinclude:: ../../../test/doc/sum_avx2.cpp

.. code-block:: c++

// compile with -msse2
#include "sum.hpp"
template float sum::operator()<xsimd::sse2, float>(xsimd::sse2, float const*, unsigned);
.. literalinclude:: ../../../test/doc/sum_sse2.cpp

37 changes: 4 additions & 33 deletions docs/source/basic_usage.rst
Expand Up @@ -13,14 +13,7 @@ Manipulating abstract batches
Here is an example that computes the mean of two batches, using the best
architecture available, based on compile-time information:

.. code::
#include "xsimd/xsimd.hpp"
namespace xs = xsimd;
xs::batch<float> mean(xs::batch<float> lhs, xs::batch<float> rhs) {
return (lhs + rhs) / 2;
}
.. literalinclude:: ../../test/doc/manipulating_abstract_batches.cpp

The batch can be a batch of 4 single-precision floating point numbers (e.g. on
Neon) or a batch of 8 (e.g. on AVX2).
Expand All @@ -32,15 +25,7 @@ The previous example can be made fully parametric, both in the batch type and
the underlying architecture. This is achieved as described in the following
example:

.. code::
#include "xsimd/xsimd.hpp"
namespace xs = xsimd;
template<class T, class Arch>
xs::batch<T, Arch> mean(xs::batch<T, Arch> lhs, xs::batch<T, Arch> rhs) {
return (lhs + rhs) / 2;
}
.. literalinclude:: ../../test/doc/manipulating_parametric_batches.cpp

At its core, a :cpp:class:`xsimd::batch` is bound to the scalar type it contains, and to the
instruction set it can use to operate on its values.
Expand All @@ -51,29 +36,15 @@ Explicit use of an instruction set extension
Here is an example that loads two batches of 4 double-precision floating point values, and
computes their mean, explicitly using the AVX extension:

.. code::
#include <iostream>
#include "xsimd/xsimd.hpp"
namespace xs = xsimd;
int main(int argc, char* argv[])
{
xs::batch<double, xs::avx> a = {1.5, 2.5, 3.5, 4.5};
xs::batch<double, xs::avx> b = {2.5, 3.5, 4.5, 5.5};
auto mean = (a + b) / 2;
std::cout << mean << std::endl;
return 0;
}
.. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set.cpp

Note that in this case, the instruction set is explicitly specified in the batch type.

This example outputs:

.. code::
(2.0, 3.0, 4.0, 5.0)
.. warning::

125 changes: 5 additions & 120 deletions docs/source/vectorized_code.rst
Expand Up @@ -9,19 +9,7 @@ Writing vectorized code

Assume that we have a simple function that computes the mean of two vectors, something like:

.. code::
#include <cstddef>
#include <vector>
void mean(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res)
{
std::size_t size = res.size();
for(std::size_t i = 0; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}
.. literalinclude:: ../../test/doc/writing_vectorized_code.cpp

How can we use `xsimd` to take advantage of vectorization?

Expand All @@ -32,32 +20,8 @@ Explicit use of an instruction set
instructions and ``A`` is the target architecture. If you know which instruction set is available on your machine, you can directly use the corresponding specialization
of ``batch``. For instance, assuming the AVX instruction set is available, the previous code can be vectorized the following way:

.. code::
.. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean.cpp

#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"
void mean(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res)
{
using b_type = xsimd::batch<double, xsimd::avx>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for(std::size_t i = 0; i < vec_size; i +=inc)
{
b_type avec = b_type::load_unaligned(&a[i]);
b_type bvec = b_type::load_unaligned(&b[i]);
b_type rvec = (avec + bvec) / 2;
rvec.store_unaligned(&res[i]);
}
// Remaining part that cannot be vectorize
for(std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}

However, if you want to write code that is portable, you cannot rely on the use of ``batch<double, xsimd::avx>``.
Indeed, this won't compile on a CPU where only the SSE2 instruction set is available, for instance. Fortunately, if you don't set the second template parameter, `xsimd` picks the best architecture among the ones available, based on the compiler flags you use.
Expand All @@ -75,33 +39,8 @@ operating on aligned memory is generally faster than operating on unaligned memo
`xsimd` provides an aligned memory allocator, namely :cpp:class:`xsimd::aligned_allocator` which follows the standard requirements, so it can be used
with STL containers. Let's change the previous code so it can take advantage of this allocator:

.. code::
.. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp

#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"
using vector_type = std::vector<double, xsimd::default_allocator<double>>;
void mean(const vector_type& a, const vector_type& b, vector_type& res)
{
using b_type = xsimd::batch<double>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for(std::size_t i = 0; i < vec_size; i += inc)
{
b_type avec = b_type::load_aligned(&a[i]);
b_type bvec = b_type::load_aligned(&b[i]);
b_type rvec = (avec + bvec) / 2;
rvec.store_aligned(&res[i]);
}
// Remaining part that cannot be vectorize
for(std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}

Memory alignment and tag dispatching
------------------------------------
Expand All @@ -111,33 +50,8 @@ case, you cannot make assumption on the memory alignment of the container. `xsim
mechanism that allows you to easily write such generic code:


.. code::
.. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp

#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"
template <class C, class Tag>
void mean(const C& a, const C& b, C& res, Tag)
{
using b_type = xsimd::batch<double>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for(std::size_t i = 0; i < vec_size; i += inc)
{
b_type avec = b_type::load(&a[i], Tag());
b_type bvec = b_type::load(&b[i], Tag());
b_type rvec = (avec + bvec) / 2;
xsimd::store(&res[i], rvec, Tag());
}
// Remaining part that cannot be vectorize
for(std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}

Here, the ``Tag`` template parameter can be :cpp:class:`xsimd::aligned_mode` or :cpp:class:`xsimd::unaligned_mode`. Assuming the existence
of a ``get_alignment_tag`` meta-function in the code, the previous code can be invoked this way:
Expand All @@ -153,36 +67,7 @@ If your code may target either SSE2, AVX2 or AVX512 instruction set, `xsimd`
makes it possible to write even more generic code by using the architecture
as a template parameter:


.. code::
#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"
struct mean {
template <class C, class Tag, class Arch>
void operator()(Arch, const C& a, const C& b, C& res, Tag)
{
using b_type = xsimd::batch<double, Arch>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for(std::size_t i = 0; i < vec_size; i += inc)
{
b_type avec = b_type::load(&a[i], Tag());
b_type bvec = b_type::load(&b[i], Tag());
b_type rvec = (avec + bvec) / 2;
xsimd::store(&res[i], rvec, Tag());
}
// Remaining part that cannot be vectorize
for(std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}
};
.. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp

This can be useful to implement runtime dispatching based on the instruction set detected at runtime. `xsimd` provides generic machinery, :cpp:func:`xsimd::dispatch()`, to implement
this pattern. Based on the above example, instead of calling ``mean{}(arch, a, b, res, tag)``, one can use ``xsimd::dispatch(mean{})(a, b, res, tag)``. More about this can be found in the :ref:`Arch Dispatching` section.
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Expand Up @@ -224,4 +224,5 @@ if (XSIMD_ENABLE_WERROR)
target_compile_options(test_xsimd PRIVATE -Werror -Wall -DXSIMD_SKIP_ON_WERROR)
endif()

add_subdirectory(doc)

30 changes: 30 additions & 0 deletions test/doc/CMakeLists.txt
@@ -0,0 +1,30 @@
# Only test under some architectures, because this is just a sanity check; no
# full coverage is needed.

if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT CMAKE_OSX_ARCHITECTURES)

add_library(test_doc_any_arch OBJECT
explicit_use_of_an_instruction_set_mean_aligned.cpp
explicit_use_of_an_instruction_set_mean_arch_independent.cpp
explicit_use_of_an_instruction_set_mean.cpp
explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp
manipulating_abstract_batches.cpp
manipulating_parametric_batches.cpp
writing_vectorized_code.cpp)
target_include_directories(test_doc_any_arch PRIVATE ${XSIMD_INCLUDE_DIR})
target_compile_options(test_doc_any_arch PRIVATE -mavx)

add_library(test_doc_avx2 OBJECT
explicit_use_of_an_instruction_set.cpp
sum_avx2.cpp)
target_compile_options(test_doc_avx2 PRIVATE -mavx2)
target_include_directories(test_doc_avx2 PRIVATE ${XSIMD_INCLUDE_DIR})

add_library(test_doc_sse2 OBJECT
sum_sse2.cpp)
target_compile_options(test_doc_sse2 PRIVATE -msse2)
target_include_directories(test_doc_sse2 PRIVATE ${XSIMD_INCLUDE_DIR})

add_dependencies(xtest test_doc_any_arch test_doc_avx2 test_doc_sse2)

endif()
13 changes: 13 additions & 0 deletions test/doc/explicit_use_of_an_instruction_set.cpp
@@ -0,0 +1,13 @@
#include "xsimd/xsimd.hpp"
#include <iostream>

namespace xs = xsimd;

int main(int argc, char* argv[])
{
xs::batch<double, xs::avx> a = { 1.5, 2.5, 3.5, 4.5 };
xs::batch<double, xs::avx> b = { 2.5, 3.5, 4.5, 5.5 };
auto mean = (a + b) / 2;
std::cout << mean << std::endl;
return 0;
}
24 changes: 24 additions & 0 deletions test/doc/explicit_use_of_an_instruction_set_mean.cpp
@@ -0,0 +1,24 @@
#include "xsimd/xsimd.hpp"
#include <cstddef>
#include <vector>

void mean(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& res)
{
using b_type = xsimd::batch<double, xsimd::avx>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for (std::size_t i = 0; i < vec_size; i += inc)
{
b_type avec = b_type::load_unaligned(&a[i]);
b_type bvec = b_type::load_unaligned(&b[i]);
b_type rvec = (avec + bvec) / 2;
rvec.store_unaligned(&res[i]);
}
// Remaining part that cannot be vectorized
for (std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}
25 changes: 25 additions & 0 deletions test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp
@@ -0,0 +1,25 @@
#include "xsimd/xsimd.hpp"
#include <cstddef>
#include <vector>

using vector_type = std::vector<double, xsimd::default_allocator<double>>;
void mean(const vector_type& a, const vector_type& b, vector_type& res)
{
using b_type = xsimd::batch<double>;
std::size_t inc = b_type::size;
std::size_t size = res.size();
// size for which the vectorization is possible
std::size_t vec_size = size - size % inc;
for (std::size_t i = 0; i < vec_size; i += inc)
{
b_type avec = b_type::load_aligned(&a[i]);
b_type bvec = b_type::load_aligned(&b[i]);
b_type rvec = (avec + bvec) / 2;
rvec.store_aligned(&res[i]);
}
// Remaining part that cannot be vectorized
for (std::size_t i = vec_size; i < size; ++i)
{
res[i] = (a[i] + b[i]) / 2;
}
}
