From e3ab51704449ed804a52b97aeef57e369234e425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 4 Feb 2021 13:09:55 +0100 Subject: [PATCH 01/33] initial thoughts parallel strided assign --- include/xtensor/xassign.hpp | 231 +++++++++++++++++++++++++++++++++++- 1 file changed, 229 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 1c5535113..79db1b1f5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -850,6 +850,80 @@ namespace xt } } } + + template + static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) + { + auto i = outer_index.size()-1; + auto stride_sizes = full_like(outer_shape,0); + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] !=0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + if (i>0) { + outer_index[i-1]++; + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + else { + if (i>0) { + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + } + i++; + // add remaining iterations + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + + template + static void nth_idx(size_t n, T& outer_index, T& outer_shape) + { + // assume outer_index is filled with 0 + auto stride_sizes = full_like(outer_shape,0); + + auto i = outer_index.size()-1; + stride_sizes[i] = 1; + i-- + // compute strides + for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + + // compute index + for (i=0; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx < outer_shape[i]) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + }; template <> @@ -874,6 +948,83 @@ namespace xt } } } + + template + static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) + { + using size_type = typename T::size_type; + size_type i = 0; + auto stride_sizes = outer_shape; + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] != 0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + + if (i+1= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + + template + static void nth_idx(size_t n, T& outer_index, T& outer_shape) + { + // assume outer_index is filled with 0 + using size_type = typename T::size_type; + auto stride_sizes = outer_shape; + size_type i = 0; + stride_sizes[i] = 1; + i++; + + // compute required strides + for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + + // compute index + i = outer_shape.size()-1; 
+ for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } }; template @@ -969,6 +1120,8 @@ namespace xt } } +#define strided_row_major_fallback +#define strided_parallel_assign template template inline void strided_loop_assigner::run(E1& e1, const E2& e2) @@ -988,7 +1141,11 @@ namespace xt is_row_major = false; break; default: +#ifdef strided_row_major_fallback + is_row_major = true; +#else return fallback_assigner(e1, e2).run(); +#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1054,7 +1211,75 @@ namespace xt { step_dim = cut; } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + static bool reported = false; + if (!reported) { + std::cout << "SIMD size is " << simd_type::size << " step_dim " << step_dim << std::endl; + reported = true; + } + std::cout << "+"; + std::size_t first_step = true; +// #define old_parallel +#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + else { + // next unaligned index + if (is_row_major) { + auto i = max_shape.size()-1; +// fct_stepper.step_back(i+1,inner_loop_size); +// res_stepper.step_back(i+1,inner_loop_size); + fct_stepper.reset(i+1); + res_stepper.reset(i+1); + fct_stepper.step_back(i+1,1); + res_stepper.step_back(i+1,1); + + + for (; i >= 0; --i) + { + if (idx[i] + 1 >= max_shape[i]) + { + idx[i] = 0; + fct_stepper.reset(i); + res_stepper.reset(i); + } + else + { + idx[i]++; + fct_stepper.step(i, 1); + res_stepper.step(i, 1); + break; + } + } + } + else { + std::cout << "oops" << std::endl; + } +// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + } +#else for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1092,7 +1317,9 @@ namespace xt } } } - } + +#endif + } template <> template From 2087ed04b4e7b5865805be9908f9ea454b9f28f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 27 May 2021 23:30:24 +0200 Subject: [PATCH 02/33] trigger strided assign properly --- include/xtensor/xassign.hpp | 568 +++++++++-------------------- include/xtensor/xstrides.hpp | 2 + include/xtensor/xview.hpp | 686 ++++++++++++++++------------------- 3 files changed, 488 insertions(+), 768 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 79db1b1f5..7c67da273 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1,30 +1,30 @@ /*************************************************************************** - * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * - * Copyright (c) QuantStack * - * * - * Distributed under the terms of the BSD 3-Clause License. * - * * - * The full license is in the file LICENSE, distributed with this software. 
* - ****************************************************************************/ +* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * +* Copyright (c) QuantStack * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ #ifndef XTENSOR_ASSIGN_HPP #define XTENSOR_ASSIGN_HPP #include -#include #include #include +#include #include #include #include "xexpression.hpp" -#include "xfunction.hpp" #include "xiterator.hpp" #include "xstrides.hpp" #include "xtensor_config.hpp" #include "xtensor_forward.hpp" #include "xutils.hpp" +#include "xfunction.hpp" #if defined(XTENSOR_USE_TBB) #include @@ -100,6 +100,7 @@ namespace xt template static bool resize(E1& e1, const xfunction& e2); + }; /******************** @@ -195,18 +196,14 @@ namespace xt template inline void assign_xexpression(xexpression& e1, const xexpression& e2) { - xtl::mpl::static_if::value>( - [&](auto self) - { - self(e2).derived_cast().assign_to(e1); - }, - /*else*/ - [&](auto /*self*/) - { - using tag = xexpression_tag_t; - xexpression_assigner::assign_xexpression(e1, e2); - } - ); + xtl::mpl::static_if::value>([&](auto self) + { + self(e2).derived_cast().assign_to(e1); + }, /*else*/ [&](auto /*self*/) + { + using tag = xexpression_tag_t; + xexpression_assigner::assign_xexpression(e1, e2); + }); } template @@ -242,19 +239,15 @@ namespace xt // A row_major or column_major container with a dimension <= 1 is computed as // layout any, leading to some performance improvements, for example when // assigning a col-major vector to a row-major vector etc - return compute_layout( - select_layout::value, - select_layout::value - ) - != layout_type::dynamic; + return compute_layout(select_layout::value, + select_layout::value) != layout_type::dynamic; } template - inline auto is_linear_assign(const E1& e1, const E2& e2) - -> std::enable_if_t::value, bool> + inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { - return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) - || (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); + return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || + (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -266,8 +259,9 @@ namespace xt template inline bool linear_dynamic_layout(const E1& e1, const E2& e2) { - return e1.is_contiguous() && e2.is_contiguous() - && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; + return e1.is_contiguous() + && e2.is_contiguous() + && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; } template @@ -276,20 +270,16 @@ namespace xt }; template - struct has_step_leading().step_leading())>> : std::true_type + struct has_step_leading().step_leading())>> + : std::true_type { }; template struct use_strided_loop { - static constexpr bool stepper_deref() - { - return std::is_reference::value; - } - - static constexpr bool value = has_strides::value - && has_step_leading::value && stepper_deref(); + static constexpr bool stepper_deref() { return std::is_reference::value; } + static constexpr bool value = has_strides::value && has_step_leading::value && stepper_deref(); }; template @@ -343,89 +333,43 @@ namespace xt template using is_bool = std::is_same; - static constexpr bool is_bool_conversion() - { - return is_bool::value && !is_bool::value; 
- } - - static constexpr bool contiguous_layout() - { - return E1::contiguous_layout && E2::contiguous_layout; - } + static constexpr bool is_bool_conversion() { return is_bool::value && !is_bool::value; } + static constexpr bool contiguous_layout() { return E1::contiguous_layout && E2::contiguous_layout; } + static constexpr bool convertible_types() { return std::is_convertible::value + && !is_bool_conversion(); } - static constexpr bool convertible_types() - { - return std::is_convertible::value && !is_bool_conversion(); - } - - static constexpr bool use_xsimd() - { - return xt_simd::simd_traits::size > 1; - } + static constexpr bool use_xsimd() { return xt_simd::simd_traits::size > 1; } template - static constexpr bool simd_size_impl() - { - return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); - } - - static constexpr bool simd_size() - { - return simd_size_impl() && simd_size_impl(); - } - - static constexpr bool simd_interface() - { - return has_simd_interface() - && has_simd_interface(); - } + static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } + static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } + static constexpr bool simd_interface() { return has_simd_interface() && + has_simd_interface(); } public: // constexpr methods instead of constexpr data members avoid the need of definitions at namespace // scope of these data members (since they are odr-used). - static constexpr bool simd_assign() - { - return convertible_types() && simd_size() && simd_interface(); - } - - static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) - { - return trivial && detail::is_linear_assign(e1, e2); - } - - static constexpr bool strided_assign() - { - return detail::use_strided_loop::value && detail::use_strided_loop::value; - } + static constexpr bool simd_assign() { return convertible_types() && simd_size() && simd_interface(); } + static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) { return trivial && detail::is_linear_assign(e1, e2); } + static constexpr bool strided_assign() { return detail::use_strided_loop::value && detail::use_strided_loop::value; } + static constexpr bool simd_linear_assign() { return contiguous_layout() && simd_assign(); } + static constexpr bool simd_strided_assign() { return strided_assign() && simd_assign(); } - static constexpr bool simd_linear_assign() - { - return contiguous_layout() && simd_assign(); - } - - static constexpr bool simd_strided_assign() - { - return strided_assign() && simd_assign(); - } + static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) { return simd_assign() + && detail::linear_dynamic_layout(e1, e2); } - static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) - { - return simd_assign() && detail::linear_dynamic_layout(e1, e2); - } + using e2_requested_value_type = std::conditional_t::value, + typename E2::bool_load_type, + e2_value_type>; + using requested_value_type = detail::conditional_promote_to_complex_t; - using e2_requested_value_type = std:: - conditional_t::value, typename E2::bool_load_type, e2_value_type>; - using requested_value_type = detail::conditional_promote_to_complex_t; }; template - inline void xexpression_assigner_base::assign_data( - xexpression& e1, - const xexpression& e2, - bool trivial - ) + inline void xexpression_assigner_base::assign_data(xexpression& e1, const xexpression& e2, bool trivial) { E1& de1 = e1.derived_cast(); const 
E2& de2 = e2.derived_cast(); @@ -437,7 +381,7 @@ namespace xt constexpr bool simd_strided_assign = traits::simd_strided_assign(); if (linear_assign) { - if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) + if(simd_linear_assign || traits::simd_linear_assign(de1, de2)) { // Do not use linear_assigner here since it will make the compiler // instantiate this branch even if the runtime condition is false, resulting @@ -451,7 +395,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) + else if (simd_strided_assign && de1.layout() == de2.layout()) { strided_loop_assigner::run(de1, de2); } @@ -487,15 +431,8 @@ namespace xt bool trivial_broadcast = de2.broadcast_shape(shape, true); - auto&& de1_shape = de1.shape(); - if (dim2 > de1.dimension() - || std::lexicographical_compare( - shape.begin(), - shape.end(), - de1_shape.begin(), - de1_shape.end(), - comperator_type() - )) + auto && de1_shape = de1.shape(); + if (dim2 > de1.dimension() || std::lexicographical_compare(shape.begin(), shape.end(), de1_shape.begin(), de1_shape.end(), comperator_type())) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -523,8 +460,7 @@ namespace xt template template - inline void - xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) + inline void xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) { const E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -568,20 +504,16 @@ namespace xt inline bool xexpression_assigner::resize(E1& e1, const xfunction& e2) { return xtl::mpl::static_if::shape_type>::value>( - [&](auto /*self*/) - { + [&](auto /*self*/) { /* * If the shape of the xfunction is statically known, we can compute the broadcast triviality * at compile time plus we can resize right away. */ // resize in case LHS is not a fixed size container. 
If it is, this is a NOP e1.resize(typename xfunction::shape_type{}); - return detail::static_trivial_broadcast< - detail::is_fixed::shape_type>::value, - CT...>::value; + return detail::static_trivial_broadcast::shape_type>::value, CT...>::value; }, - /* else */ - [&](auto /*self*/) + /* else */ [&](auto /*self*/) { using index_type = xindex_type_t; using size_type = typename E1::size_type; @@ -604,10 +536,9 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = xtl::is_arithmetic::value - && (sizeof(result_type) < sizeof(argument_type) - || (xtl::is_integral::value - && std::is_floating_point::value)); + static const bool value = xtl::is_arithmetic::value && + (sizeof(result_type) < sizeof(argument_type) || + (xtl::is_integral::value && std::is_floating_point::value)); }; template @@ -625,16 +556,15 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = is_narrowing_conversion::value - || has_sign_conversion::value; + static const bool value = is_narrowing_conversion::value || + has_sign_conversion::value; }; template inline stepper_assigner::stepper_assigner(E1& e1, const E2& e2) - : m_e1(e1) - , m_lhs(e1.stepper_begin(e1.shape())) - , m_rhs(e2.stepper_begin(e1.shape())) - , m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) + : m_e1(e1), m_lhs(e1.stepper_begin(e1.shape())), + m_rhs(e2.stepper_begin(e1.shape())), + m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) { } @@ -690,6 +620,7 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { +// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -713,18 +644,10 @@ namespace xt #if defined(XTENSOR_USE_TBB) if (size >= XTENSOR_TBB_THRESHOLD) { - tbb::parallel_for( - align_begin, - align_end, - simd_size, - [&e1, &e2](size_t i) - { - e1.template store_simd( - i, - e2.template load_simd(i) - ); - } - ); + tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) + { + e1.template store_simd(i, e2.template load_simd(i)); + }); } else { @@ -736,20 +659,20 @@ namespace xt #elif defined(XTENSOR_USE_OPENMP) if (size >= XTENSOR_OPENMP_TRESHOLD) { -#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) -#ifndef _WIN32 + #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) + #ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } -#else - for (auto i = static_cast(align_begin); i < static_cast(align_end); - i += static_cast(simd_size)) + #else + for(auto i = static_cast(align_begin); i < static_cast(align_end); + i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } -#endif + #endif } else { @@ -773,8 +696,8 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { - using is_convertible = std:: - is_convertible::value_type, typename std::decay_t::value_type>; + using is_convertible = std::is_convertible::value_type, + typename std::decay_t::value_type>; // If the types are not compatible, this function is still instantiated but never called. // To avoid compilation problems in effectively unused code trivial_assigner_run_impl is // empty in this case. 
@@ -789,19 +712,15 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); - +// std::cout << "Linear assigner LOOP SIZE is " << n <(n), - [&](std::ptrdiff_t i) - { - *(dst + i) = static_cast(*(src + i)); - } - ); + tbb::parallel_for(std::ptrdiff_t(0), static_cast(n), [&](std::ptrdiff_t i) + { + *(dst + i) = static_cast(*(src + i)); + }); #elif defined(XTENSOR_USE_OPENMP) -#pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n); i++) + #pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) { *(dst + i) = static_cast(*(src + i)); } @@ -818,7 +737,9 @@ namespace xt template inline void linear_assigner::run_impl(E1&, const E2&, std::false_type /*is_convertible*/) { - XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types."); + XTENSOR_PRECONDITION(false, + "Internal error: linear_assigner called with unrelated types."); + } /**************************************** @@ -851,75 +772,29 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) - { - auto i = outer_index.size()-1; - auto stride_sizes = full_like(outer_shape,0); - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i >= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] !=0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - if (i>0) { - outer_index[i-1]++; - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - else { - if (i>0) { - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - } - i++; - // add remaining iterations - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 - auto stride_sizes = full_like(outer_shape,0); - auto i = outer_index.size()-1; - stride_sizes[i] = 1; - i-- + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); // compute strides - for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + using size_type = typename T::size_type; + for ( size_type i = outer_shape.size(); i>0; i--) { + stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } // compute index - for (i=0; i < outer_index.size(); i++) + for (size_type i=0; i < outer_shape.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen + outer_index[i] = 0; } } } @@ -949,80 +824,27 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) - { - using size_type = typename T::size_type; - size_type i = 0; - auto stride_sizes = outer_shape; - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] != 0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - - if (i+1= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + using size_type = typename T::size_type; - auto stride_sizes = outer_shape; - size_type i = 0; - stride_sizes[i] = 1; - i++; // compute required strides - for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } // compute index - i = outer_shape.size()-1; - for (; i >= 0; i--) - { + for (size_type i = outer_shape.size(); i>0;) + { + i--; auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } - else { - // that may not happen - } +// else {} // that may not happen } } }; @@ -1033,13 +855,14 @@ namespace xt using strides_type = S; check_strides_functor(const S& strides) - : m_cut(L == layout_type::row_major ? 0 : strides.size()) - , m_strides(strides) + : m_cut(L == layout_type::row_major ? 
0 : strides.size()), + m_strides(strides) { } template - std::enable_if_t operator()(const T& el) + std::enable_if_t + operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) @@ -1050,7 +873,8 @@ namespace xt } template - std::enable_if_t operator()(const T& el) + std::enable_if_t + operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var < m_cut) @@ -1089,27 +913,22 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + if (cut(e1.strides() - ); + auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - } // can't reach here because this would have already triggered the fallback + if (cut>1) cut = 1; + } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; - std::size_t outer_loop_size = static_cast(std::accumulate( - e1.shape().begin(), - e1.shape().begin() + static_cast(cut), - shape_value_type(1), - std::multiplies{} - )); - std::size_t inner_loop_size = static_cast(std::accumulate( - e1.shape().begin() + static_cast(cut), - e1.shape().end(), - shape_value_type(1), - std::multiplies{} - )); + std::size_t outer_loop_size = static_cast( + std::accumulate(e1.shape().begin(), e1.shape().begin() + static_cast(cut), + shape_value_type(1), std::multiplies{})); + std::size_t inner_loop_size = static_cast( + std::accumulate(e1.shape().begin() + static_cast(cut), e1.shape().end(), + shape_value_type(1), std::multiplies{})); if (E1::static_layout == layout_type::column_major || !is_row_major) { @@ -1120,7 +939,6 @@ namespace xt } } -#define strided_row_major_fallback #define strided_parallel_assign template template @@ -1128,10 +946,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); + switch (dynamic_layout) { case layout_type::row_major: @@ -1141,11 +959,7 @@ namespace xt is_row_major = false; break; default: -#ifdef strided_row_major_fallback - is_row_major = true; -#else return fallback_assigner(e1, e2).run(); -#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1162,11 +976,7 @@ namespace xt } std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( - e1, - e2, - is_row_major - ); + std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes(e1, e2, is_row_major); if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) { @@ -1193,10 +1003,9 @@ namespace xt using e2_value_type = typename E2::value_type; constexpr bool needs_cast = has_assign_conversion::value; using value_type = typename xassign_traits::requested_value_type; - using simd_type = std::conditional_t< - std::is_same::value, - xt_simd::simd_bool_type, - xt_simd::simd_type>; + using simd_type = std::conditional_t::value, + xt_simd::simd_bool_type, + xt_simd::simd_type>; std::size_t simd_size = inner_loop_size / simd_type::size; std::size_t simd_rest = inner_loop_size % simd_type::size; @@ -1207,79 +1016,59 @@ namespace xt // TODO in 1D case this is ambigous -- could be RM or CM. 
// Use default layout to make decision std::size_t step_dim = 0; - if (!is_row_major) // row major case + if (!is_row_major) // row major case { step_dim = cut; } +// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - else { - // next unaligned index - if (is_row_major) { - auto i = max_shape.size()-1; -// fct_stepper.step_back(i+1,inner_loop_size); -// res_stepper.step_back(i+1,inner_loop_size); - fct_stepper.reset(i+1); - res_stepper.reset(i+1); - fct_stepper.step_back(i+1,1); - res_stepper.step_back(i+1,1); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (; i >= 0; --i) + for (std::size_t i = 0; i < idx.size(); ++i) { - if (idx[i] + 1 >= max_shape[i]) - { - idx[i] = 0; - fct_stepper.reset(i); - res_stepper.reset(i); - } - else - { - idx[i]++; - fct_stepper.step(i, 1); - res_stepper.step(i, 1); - break; - } + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } + first_step = false; } - else { - std::cout << "oops" << std::endl; + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } -// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); } - } -#else + } + else { + +#endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1293,9 +1082,9 @@ namespace xt fct_stepper.step_leading(); } - is_row_major - ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) - : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + is_row_major ? 
+ strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); fct_stepper.to_begin(); @@ -1317,7 +1106,8 @@ namespace xt } } } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + } #endif } diff --git a/include/xtensor/xstrides.hpp b/include/xtensor/xstrides.hpp index ae261e354..99ffa811c 100644 --- a/include/xtensor/xstrides.hpp +++ b/include/xtensor/xstrides.hpp @@ -544,8 +544,10 @@ namespace xt { for (std::size_t i = strides.size(); i != 0; --i) { +// std::cout << "matching: " << strides[i - 1] << " && " << shape[i - 1]<< " && " << data_size << std::endl; if (!stride_match_condition(strides[i - 1], shape[i - 1], data_size, zero_strides)) { +// std::cout << "non-matching: " << strides[i - 1] << " != " << data_size << std::endl; return false; } data_size *= static_cast(shape[i - 1]); diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 0a2dbb6be..931bb67a9 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -1,11 +1,11 @@ /*************************************************************************** - * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * - * Copyright (c) QuantStack * - * * - * Distributed under the terms of the BSD 3-Clause License. * - * * - * The full license is in the file LICENSE, distributed with this software. * - ****************************************************************************/ +* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * +* Copyright (c) QuantStack * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ #ifndef XTENSOR_VIEW_HPP #define XTENSOR_VIEW_HPP @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include "xaccessible.hpp" @@ -34,6 +34,7 @@ #include "xtensor_forward.hpp" #include "xview_utils.hpp" + namespace xt { @@ -75,22 +76,26 @@ namespace xt { template - struct is_xrange : std::false_type + struct is_xrange + : std::false_type { }; template - struct is_xrange> : std::true_type + struct is_xrange> + : std::true_type { }; template - struct is_xall_slice : std::false_type + struct is_xall_slice + : std::false_type { }; template - struct is_xall_slice> : std::true_type + struct is_xall_slice> + : std::true_type { }; @@ -129,33 +134,32 @@ namespace xt template struct is_xscalar_impl> { - static constexpr bool value = static_cast(integral_count() - ) == static_dimension::shape_type>::value - ? true - : false; + static constexpr bool value = static_cast(integral_count()) == static_dimension::shape_type>::value ? true : false; }; template - struct is_strided_slice_impl : std::true_type + struct is_strided_slice_impl + : std::true_type { }; template - struct is_strided_slice_impl> : std::false_type + struct is_strided_slice_impl> + : std::false_type { }; template - struct is_strided_slice_impl> : std::false_type + struct is_strided_slice_impl> + : std::false_type { }; // If we have no discontiguous slices, we can calculate strides for this view. 
template struct is_strided_view - : std::integral_constant< - bool, - xtl::conjunction, is_strided_slice_impl>...>::value> + : std::integral_constant, + is_strided_slice_impl>...>::value> { }; @@ -179,20 +183,25 @@ namespace xt static constexpr bool have_all_seen = all_seen || is_all_slice; static constexpr bool have_range_seen = is_range_slice; - static constexpr bool is_valid = valid - && (have_all_seen - ? is_all_slice - : (!range_seen && (is_int_slice || is_range_slice))); - - static constexpr bool value = is_contiguous_view_impl < layout_type::row_major, is_valid, - have_all_seen, range_seen || is_range_slice, - xtl::mpl::pop_front_t < V >> ::value; + static constexpr bool is_valid = valid && (have_all_seen ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); + + static constexpr bool value = is_contiguous_view_impl>::value; + static constexpr size_t contiguous_dimensions = is_contiguous_view_impl>::contiguous_dimensions + (value ? 1 :0); }; template struct is_contiguous_view_impl> { static constexpr bool value = valid; + static constexpr size_t contiguous_dimensions = 0; }; // For column major the *same* but reverse is true -- with the additional @@ -208,13 +217,12 @@ namespace xt static constexpr bool have_int_seen = int_seen || is_int_slice; - static constexpr bool is_valid = valid - && (have_int_seen - ? is_int_slice - : (!range_seen && (is_all_slice || is_range_slice))); - static constexpr bool value = is_contiguous_view_impl < layout_type::column_major, is_valid, - have_int_seen, is_range_slice || range_seen, - xtl::mpl::pop_front_t < V >> ::value; + static constexpr bool is_valid = valid && (have_int_seen ? is_int_slice : (!range_seen && (is_all_slice || is_range_slice))); + static constexpr bool value = is_contiguous_view_impl>::value; }; template @@ -226,14 +234,11 @@ namespace xt // TODO relax has_data_interface constraint here! template struct is_contiguous_view - : std::integral_constant< - bool, - has_data_interface::value - && !( - E::static_layout == layout_type::column_major - && static_cast(static_dimension::value) != sizeof...(S) - ) - && is_contiguous_view_impl>::value> + : std::integral_constant::value && + !(E::static_layout == layout_type::column_major && static_cast(static_dimension::value) != sizeof...(S)) && + is_contiguous_view_impl>::value + > { }; @@ -271,20 +276,21 @@ namespace xt struct get_contigous_shape_type { // if we have no `range` in the slices we can re-use the shape with an offset - using type = std::conditional_t< - xtl::disjunction...>::value, - typename xview_shape_type::type, - // In the false branch we know that we have only integers at the front OR end, and NO range - typename unwrap_offset_container()>::type>; + using type = std::conditional_t...>::value, + typename xview_shape_type::type, + // In the false branch we know that we have only integers at the front OR end, and NO range + typename unwrap_offset_container()>::type>; }; template - struct is_sequence_view : std::integral_constant + struct is_sequence_view + : std::integral_constant { }; template - struct is_sequence_view> : std::integral_constant + struct is_sequence_view> + : std::integral_constant { }; } @@ -298,16 +304,15 @@ namespace xt using size_type = typename xexpression_type::size_type; using temporary_type = view_temporary_type_t; - static constexpr layout_type layout = detail::is_contiguous_view::value - ? xexpression_type::static_layout - : layout_type::dynamic; + static constexpr layout_type layout = + detail::is_contiguous_view::value ? 
+ xexpression_type::static_layout : layout_type::dynamic; static constexpr bool is_const = std::is_const>::value; - using extract_storage_type = xtl::mpl::eval_if_t< - has_data_interface, - detail::expr_storage_type, - make_invalid_type<>>; + using extract_storage_type = xtl::mpl::eval_if_t, + detail::expr_storage_type, + make_invalid_type<>>; using storage_type = std::conditional_t; }; @@ -319,20 +324,17 @@ namespace xt static constexpr bool is_strided_view = detail::is_strided_view::value; static constexpr bool is_contiguous_view = detail::is_contiguous_view::value; - using inner_shape_type = std::conditional_t< - is_contiguous_view, - typename detail::get_contigous_shape_type::type, - typename xview_shape_type::type>; + using inner_shape_type = std::conditional_t::type, + typename xview_shape_type::type>; - using stepper = std::conditional_t< - is_strided_view, - xstepper>, - xview_stepper>::value, CT, S...>>; + using stepper = std::conditional_t>, + xview_stepper>::value, CT, S...>>; - using const_stepper = std::conditional_t< - is_strided_view, - xstepper>, - xview_stepper, S...>>; + using const_stepper = std::conditional_t>, + xview_stepper, S...>>; }; /** @@ -352,9 +354,9 @@ namespace xt template class xview : public xview_semantic>, public std::conditional_t< - detail::is_contiguous_view, S...>::value, - xcontiguous_iterable>, - xiterable>>, + detail::is_contiguous_view, S...>::value, + xcontiguous_iterable>, + xiterable>>, public xaccessible>, public extension::xview_base_t { @@ -376,8 +378,9 @@ namespace xt using bool_load_type = typename xexpression_type::bool_load_type; using reference = typename inner_types::reference; using const_reference = typename inner_types::const_reference; - using pointer = std:: - conditional_t; + using pointer = std::conditional_t; using const_pointer = typename xexpression_type::const_pointer; using size_type = typename inner_types::size_type; using difference_type = typename xexpression_type::difference_type; @@ -392,35 +395,28 @@ namespace xt using inner_shape_type = typename iterable_base::inner_shape_type; using shape_type = typename xview_shape_type::type; - using xexpression_inner_strides_type = xtl::mpl::eval_if_t< - has_strides, - detail::expr_inner_strides_type, - get_strides_type>; + using xexpression_inner_strides_type = xtl::mpl::eval_if_t, + detail::expr_inner_strides_type, + get_strides_type>; - using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t< - has_strides, - detail::expr_inner_backstrides_type, - get_strides_type>; + using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t, + detail::expr_inner_backstrides_type, + get_strides_type>; using storage_type = typename inner_types::storage_type; - static constexpr bool has_trivial_strides = is_contiguous_view - && !xtl::disjunction...>::value; - using inner_strides_type = std::conditional_t< - has_trivial_strides, - typename detail::unwrap_offset_container< - xexpression_type::static_layout, - xexpression_inner_strides_type, - integral_count()>::type, - get_strides_t>; - - using inner_backstrides_type = std::conditional_t< - has_trivial_strides, - typename detail::unwrap_offset_container< - xexpression_type::static_layout, - xexpression_inner_backstrides_type, - integral_count()>::type, - get_strides_t>; + static constexpr bool has_trivial_strides = is_contiguous_view && !xtl::disjunction...>::value; + using inner_strides_type = std::conditional_t()>::type, + get_strides_t>; + + using inner_backstrides_type = std::conditional_t()>::type, + get_strides_t>; using 
strides_type = get_strides_t; using backstrides_type = strides_type; @@ -431,21 +427,21 @@ namespace xt using stepper = typename iterable_base::stepper; using const_stepper = typename iterable_base::const_stepper; - using linear_iterator = std::conditional_t< - has_data_interface::value && is_strided_view, - std::conditional_t, - typename iterable_base::linear_iterator>; - using const_linear_iterator = std::conditional_t< - has_data_interface::value && is_strided_view, - typename xexpression_type::const_linear_iterator, - typename iterable_base::const_linear_iterator>; + using linear_iterator = std::conditional_t::value && is_strided_view, + std::conditional_t, + typename iterable_base::linear_iterator>; + using const_linear_iterator = std::conditional_t::value && is_strided_view, + typename xexpression_type::const_linear_iterator, + typename iterable_base::const_linear_iterator>; using reverse_linear_iterator = std::reverse_iterator; using const_reverse_linear_iterator = std::reverse_iterator; using container_iterator = pointer; using const_container_iterator = const_pointer; - static constexpr std::size_t rank = SIZE_MAX; + constexpr static std::size_t rank = SIZE_MAX; // The FSL argument prevents the compiler from calling this constructor // instead of the copy constructor when sizeof...(SL) == 0. @@ -494,36 +490,48 @@ namespace xt bool has_linear_assign(const ST& strides) const; template - std::enable_if_t stepper_begin(const ST& shape); + std::enable_if_t + stepper_begin(const ST& shape); template - std::enable_if_t stepper_end(const ST& shape, layout_type l); + std::enable_if_t + stepper_end(const ST& shape, layout_type l); template - std::enable_if_t stepper_begin(const ST& shape) const; + std::enable_if_t + stepper_begin(const ST& shape) const; template - std::enable_if_t stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t + stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t stepper_begin(const ST& shape); + std::enable_if_t + stepper_begin(const ST& shape); template - std::enable_if_t stepper_end(const ST& shape, layout_type l); + std::enable_if_t + stepper_end(const ST& shape, layout_type l); template - std::enable_if_t stepper_begin(const ST& shape) const; + std::enable_if_t + stepper_begin(const ST& shape) const; template - std::enable_if_t stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t + stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t::value, storage_type&> storage(); + std::enable_if_t::value, storage_type&> + storage(); template - std::enable_if_t::value, const storage_type&> storage() const; + std::enable_if_t::value, const storage_type&> + storage() const; template - std::enable_if_t::value && is_strided_view, linear_iterator> linear_begin(); + std::enable_if_t::value && is_strided_view, linear_iterator> + linear_begin(); template - std::enable_if_t::value && is_strided_view, linear_iterator> linear_end(); + std::enable_if_t::value && is_strided_view, linear_iterator> + linear_end(); template std::enable_if_t::value && is_strided_view, const_linear_iterator> @@ -574,10 +582,12 @@ namespace xt backstrides() const; template - std::enable_if_t::value && is_strided_view, const_pointer> data() const; + std::enable_if_t::value && is_strided_view, const_pointer> + data() const; template - std::enable_if_t::value && is_strided_view, pointer> data(); + std::enable_if_t::value && is_strided_view, pointer> + data(); template std::enable_if_t::value && is_strided_view, std::size_t> @@ 
-610,13 +620,11 @@ namespace xt size_type underlying_size(size_type dim) const; xtl::xclosure_pointer operator&() &; - xtl::xclosure_pointer operator&() const&; + xtl::xclosure_pointer operator&() const &; xtl::xclosure_pointer operator&() &&; - template < - class E, - class T = xexpression_type, - class = std::enable_if_t::value && is_contiguous_view, int>> + template ::value && is_contiguous_view, int>> void assign_to(xexpression& e, bool force_resize) const; template @@ -638,11 +646,9 @@ namespace xt template enable_simd_interface store_simd(size_type i, const simd& e); - template < - class align, - class requested_type = value_type, - std::size_t N = xt_simd::simd_traits::size, - class T = xexpression_type> + template ::size, + class T = xexpression_type> enable_simd_interface> load_simd(size_type i) const; template @@ -783,7 +789,9 @@ namespace xt { public: - using view_type = std::conditional_t, xview>; + using view_type = std::conditional_t, + xview>; using substepper_type = get_stepper; using value_type = typename substepper_type::value_type; @@ -795,13 +803,8 @@ namespace xt using shape_type = typename substepper_type::shape_type; xview_stepper() = default; - xview_stepper( - view_type* view, - substepper_type it, - size_type offset, - bool end = false, - layout_type l = XTENSOR_DEFAULT_TRAVERSAL - ); + xview_stepper(view_type* view, substepper_type it, + size_type offset, bool end = false, layout_type l = XTENSOR_DEFAULT_TRAVERSAL); reference operator*() const; @@ -884,7 +887,7 @@ namespace xt std::forward(e), std::forward(first_slice), std::forward(slices)... - ) + ) { } @@ -892,26 +895,25 @@ namespace xt template template xview::xview(std::true_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)) - , m_slices(std::forward(first_slice), std::forward(slices)...) - , m_shape(compute_shape(detail::is_sequence_view{})) - , m_strides(m_e.strides()) - , m_backstrides(m_e.backstrides()) - , m_data_offset(data_offset_impl(std::make_index_sequence())) - , m_strides_computed(true) + : m_e(std::forward(e)), + m_slices(std::forward(first_slice), std::forward(slices)...), + m_shape(compute_shape(detail::is_sequence_view{})), + m_strides(m_e.strides()), + m_backstrides(m_e.backstrides()), + m_data_offset(data_offset_impl(std::make_index_sequence())), + m_strides_computed(true) { } template template xview::xview(std::false_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)) - , m_slices(std::forward(first_slice), std::forward(slices)...) - , m_shape(compute_shape(std::false_type{})) - , m_strides_computed(false) + : m_e(std::forward(e)), + m_slices(std::forward(first_slice), std::forward(slices)...), + m_shape(compute_shape(std::false_type{})), + m_strides_computed(false) { } - //@} template @@ -934,7 +936,6 @@ namespace xt { return semantic_base::operator=(e); } - //@} template @@ -973,36 +974,50 @@ namespace xt template inline layout_type xview::layout() const noexcept { - return xtl::mpl::static_if( - [&](auto self) + return xtl::mpl::static_if([&](auto self) + { + if (static_layout != layout_type::dynamic) { - if (static_layout != layout_type::dynamic) - { - return static_layout; - } - else - { - bool strides_match = do_strides_match( - self(this)->shape(), - self(this)->strides(), - self(this)->m_e.layout(), - true - ); - return strides_match ? 
self(this)->m_e.layout() : layout_type::dynamic; - } - }, - /* else */ - [&](auto /*self*/) + return static_layout; + } + else { - return layout_type::dynamic; + return self(this)->m_e.layout()==layout_type::row_major ? + ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : + ((self(this)->strides()[0] == 1) ? self(this)->m_e.layout() : layout_type::dynamic); + + bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); + return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; } - ); + }, + /* else */ [&](auto /*self*/) + { + std::cout << "View is not strided!" << std::endl; + return layout_type::dynamic; + }); } template inline bool xview::is_contiguous() const noexcept { - return layout() != layout_type::dynamic; + return xtl::mpl::static_if([&](auto self) + { + if (static_layout != layout_type::dynamic) + { + return true; + } + else + { + bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); + return strides_match ? self(this)->m_e.layout()!=layout_type::dynamic : false; + } + }, + /* else */ [&](auto /*self*/) + { + return false; + }); + +// return layout() != layout_type::dynamic; } //@} @@ -1020,17 +1035,13 @@ namespace xt template inline void xview::fill(const T& value) { - xtl::mpl::static_if( - [&](auto self) - { - std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); - }, - /*else*/ - [&](auto self) - { - std::fill(self(this)->begin(), self(this)->end(), value); - } - ); + xtl::mpl::static_if([&](auto self) + { + std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); + }, /*else*/ [&](auto self) + { + std::fill(self(this)->begin(), self(this)->end(), value); + }); } /** @@ -1076,6 +1087,7 @@ namespace xt return unchecked_impl(make_index_sequence(args...), static_cast(args)...); } + template template inline auto xview::element(It first, It last) -> reference @@ -1163,15 +1175,16 @@ namespace xt */ template template - inline auto xview::storage() -> std::enable_if_t::value, storage_type&> + inline auto xview::storage() -> + std::enable_if_t::value, storage_type&> { return m_e.storage(); } template template - inline auto xview::storage() const - -> std::enable_if_t::value, const storage_type&> + inline auto xview::storage() const -> + std::enable_if_t::value, const storage_type&> { return m_e.storage(); } @@ -1277,8 +1290,8 @@ namespace xt */ template template - inline auto xview::strides() const - -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::strides() const -> + std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1290,8 +1303,8 @@ namespace xt template template - inline auto xview::backstrides() const - -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::backstrides() const -> + std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1306,16 +1319,16 @@ namespace xt */ template template - inline auto xview::data() const - -> std::enable_if_t::value && is_strided_view, const_pointer> + inline auto xview::data() const -> + std::enable_if_t::value && is_strided_view, const_pointer> { return m_e.data(); } template template - inline auto xview::data() - -> std::enable_if_t::value && is_strided_view, pointer> + inline auto xview::data() -> + std::enable_if_t::value && 
is_strided_view, pointer> { return m_e.data(); } @@ -1324,9 +1337,9 @@ namespace xt template inline std::size_t xview::data_offset_impl(std::index_sequence) const noexcept { - auto temp = std::array( - {(static_cast(xt::value(std::get(m_slices), 0)))...} - ); + auto temp = std::array({ + (static_cast(xt::value(std::get(m_slices), 0)))... + }); std::ptrdiff_t result = 0; std::size_t i = 0; @@ -1346,8 +1359,8 @@ namespace xt */ template template - inline auto xview::data_offset() const noexcept - -> std::enable_if_t::value && is_strided_view, std::size_t> + inline auto xview::data_offset() const noexcept -> + std::enable_if_t::value && is_strided_view, std::size_t> { if (!m_strides_computed) { @@ -1356,7 +1369,6 @@ namespace xt } return m_data_offset; } - //@} template @@ -1372,7 +1384,7 @@ namespace xt } template - inline auto xview::operator&() const& -> xtl::xclosure_pointer + inline auto xview::operator&() const & -> xtl::xclosure_pointer { return xtl::closure_pointer(*this); } @@ -1409,20 +1421,15 @@ namespace xt template inline bool xview::has_linear_assign(const ST& str) const { - return xtl::mpl::static_if( - [&](auto self) - { - return str.size() == self(this)->strides().size() - && std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); - }, - /*else*/ - [](auto /*self*/) - { - return false; - } - ); - } + return xtl::mpl::static_if([&](auto self) + { + return str.size() == self(this)->strides().size() && + std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); + }, /*else*/ [](auto /*self*/){ + return false; + }); + } //@} template @@ -1458,8 +1465,7 @@ namespace xt } template - inline auto xview::data_xend(layout_type l, size_type offset) const noexcept - -> const_container_iterator + inline auto xview::data_xend(layout_type l, size_type offset) const noexcept -> const_container_iterator { return data_xend_impl(data() + data_offset(), l, offset); } @@ -1497,8 +1503,7 @@ namespace xt template template - inline auto xview::load_simd(size_type i) const - -> enable_simd_interface> + inline auto xview::load_simd(size_type i) const -> enable_simd_interface> { return m_e.template load_simd(data_offset() + i); } @@ -1537,10 +1542,9 @@ namespace xt template inline auto xview::make_index_sequence(Args...) const noexcept { - return std::make_index_sequence< - (sizeof...(Args) + integral_count() > newaxis_count() - ? sizeof...(Args) + integral_count() - newaxis_count() - : 0)>(); + return std::make_index_sequence<(sizeof...(Args)+integral_count() > newaxis_count() ? + sizeof...(Args)+integral_count() - newaxis_count() : + 0)>(); } template @@ -1548,12 +1552,11 @@ namespace xt inline auto xview::compute_strides_impl(std::index_sequence) const noexcept { std::size_t original_dim = m_e.dimension(); - return std::array( - {(static_cast(xt::step_size(std::get(I)>(m_slices), 1)) - * ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim - ? m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] - : 1))...} - ); + return std::array({ + (static_cast(xt::step_size(std::get(I)>(m_slices), 1)) * + ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim ? + m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] : 1))... + }); } template @@ -1629,8 +1632,7 @@ namespace xt template template ::size_type... I, class... Args> - inline auto xview::unchecked_impl(std::index_sequence, Args... args) const - -> const_reference + inline auto xview::unchecked_impl(std::index_sequence, Args... 
args) const -> const_reference { return m_e.unchecked(index(args...)...); } @@ -1651,19 +1653,15 @@ namespace xt template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const - -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> { - return sliced_access(I) + newaxis_count_before(I + 1)>( - std::get(I + 1)>(m_slices), - args... - ); + return sliced_access(I) + newaxis_count_before(I + 1)> + (std::get(I + 1)>(m_slices), args...); } template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const - -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> { return argument() + newaxis_count()>(args...); } @@ -1680,8 +1678,7 @@ namespace xt inline auto xview::sliced_access(const xslice& slice, Arg arg, Args... args) const -> size_type { using ST = typename T::size_type; - return static_cast(slice.derived_cast( - )(argument(static_cast(arg), static_cast(args)...))); + return static_cast(slice.derived_cast()(argument(static_cast(arg), static_cast(args)...))); } template @@ -1698,17 +1695,14 @@ namespace xt auto index = xtl::make_sequence(m_e.dimension(), 0); using diff_type = typename std::iterator_traits::difference_type; using ivalue_type = typename base_index_type::value_type; - auto func1 = [&first](const auto& s) noexcept - { + auto func1 = [&first](const auto& s) noexcept { return get_slice_value(s, first); }; - auto func2 = [](const auto& s) noexcept - { + auto func2 = [](const auto& s) noexcept { return xt::value(s, 0); }; - auto s = static_cast((std::min - )(static_cast(std::distance(first, last)), this->dimension())); + auto s = static_cast((std::min)(static_cast(std::distance(first, last)), this->dimension())); auto first_copy = last - s; for (size_type i = 0; i != m_e.dimension(); ++i) { @@ -1720,12 +1714,13 @@ namespace xt if (first < last) { - index[i] = k < sizeof...(S) ? apply(k, func1, m_slices) - : static_cast(*first); + index[i] = k < sizeof...(S) ? + apply(k, func1, m_slices) : static_cast(*first); } else { - index[i] = k < sizeof...(S) ? apply(k, func2, m_slices) : ivalue_type(0); + index[i] = k < sizeof...(S) ? + apply(k, func2, m_slices) : ivalue_type(0); } } return index; @@ -1742,15 +1737,12 @@ namespace xt { std::size_t dim = m_e.dimension() - integral_count() + newaxis_count(); auto shape = xtl::make_sequence(dim, 0); - auto func = [](const auto& s) noexcept - { - return get_size(s); - }; + auto func = [](const auto& s) noexcept { return get_size(s); }; for (size_type i = 0; i != dim; ++i) { size_type index = integral_skip(i); - shape[i] = index < sizeof...(S) ? apply(index, func, m_slices) - : m_e.shape()[index - newaxis_count_before(index)]; + shape[i] = index < sizeof...(S) ? 
+ apply(index, func, m_slices) : m_e.shape()[index - newaxis_count_before(index)]; } return shape; } @@ -1764,8 +1756,7 @@ namespace xt } template - inline void - run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) + inline void run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) { std::copy(t.cbegin(), t.cend(), v.begin()); } @@ -1774,8 +1765,8 @@ namespace xt template inline void xview::assign_temporary_impl(temporary_type&& tmp) { - constexpr bool fast_assign = detail::is_strided_view::value - && xassign_traits, temporary_type>::simd_strided_assign(); + constexpr bool fast_assign = detail::is_strided_view::value && \ + xassign_traits, temporary_type>::simd_strided_assign(); xview_detail::run_assign_temporary_impl(*this, tmp, std::integral_constant{}); } @@ -1808,13 +1799,8 @@ namespace xt { // Checks that no ellipsis slice is used using view_type = xview, get_slice_type, S>...>; - return view_type( - std::forward(e), - get_slice_implementation( - e, - std::forward(slices), - get_underlying_shape_index, S...>(I) - )... + return view_type(std::forward(e), + get_slice_implementation(e, std::forward(slices), get_underlying_shape_index, S...>(I))... ); } } @@ -1831,11 +1817,7 @@ namespace xt template inline auto view(E&& e, S&&... slices) { - return detail::make_view_impl( - std::forward(e), - std::make_index_sequence(), - std::forward(slices)... - ); + return detail::make_view_impl(std::forward(e), std::make_index_sequence(), std::forward(slices)...); } namespace detail @@ -1843,9 +1825,8 @@ namespace xt class row_impl { public: - - template - inline static auto make(E&& e, const std::ptrdiff_t index) + template + static inline auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1853,21 +1834,17 @@ namespace xt } private: - - template - inline static void check_dimension(const S& shape) + template + static inline void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW( - std::invalid_argument, - "A row can only be accessed on an expression with exact two dimensions" - ); + XTENSOR_THROW(std::invalid_argument, "A row can only be accessed on an expression with exact two dimensions"); } } - template - inline static void check_dimension(const std::array&) + template + static inline void check_dimension(const std::array&) { static_assert(N == 2, "A row can only be accessed on an expression with exact two dimensions"); } @@ -1876,9 +1853,8 @@ namespace xt class column_impl { public: - - template - inline static auto make(E&& e, const std::ptrdiff_t index) + template + static inline auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1886,21 +1862,17 @@ namespace xt } private: - - template - inline static void check_dimension(const S& shape) + template + static inline void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW( - std::invalid_argument, - "A column can only be accessed on an expression with exact two dimensions" - ); + XTENSOR_THROW(std::invalid_argument, "A column can only be accessed on an expression with exact two dimensions"); } } - template - inline static void check_dimension(const std::array&) + template + static inline void check_dimension(const std::array&) { static_assert(N == 2, "A column can only be accessed on an expression with exact two dimensions"); } @@ -1951,8 +1923,7 @@ namespace xt template template - inline auto 
xview::stepper_end(const ST& shape, layout_type l) - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, m_e.stepper_end(m_e.shape(), l), offset, true, l); @@ -1960,8 +1931,7 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const - -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1970,8 +1940,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1988,8 +1957,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, data_xend(l, offset), offset); @@ -1997,8 +1965,7 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const - -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xbegin(), offset); @@ -2006,8 +1973,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const-> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xend(l, offset), offset); @@ -2018,24 +1984,14 @@ namespace xt ********************************/ template - inline xview_stepper::xview_stepper( - view_type* view, - substepper_type it, - size_type offset, - bool end, - layout_type l - ) - : p_view(view) - , m_it(it) - , m_offset(offset) + inline xview_stepper::xview_stepper(view_type* view, substepper_type it, + size_type offset, bool end, layout_type l) + : p_view(view), m_it(it), m_offset(offset) { if (!end) { std::fill(m_index_keeper.begin(), m_index_keeper.end(), 0); - auto func = [](const auto& s) noexcept - { - return xt::value(s, 0); - }; + auto func = [](const auto& s) noexcept { return xt::value(s, 0); }; for (size_type i = 0; i < sizeof...(S); ++i) { if (!is_newaxis_slice(i)) @@ -2061,18 +2017,14 @@ namespace xt template inline void xview_stepper::step(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_step_forward(dim, func); } template inline void xview_stepper::step_back(size_type dim) { - auto func = [this](size_type index, size_type offset) - { + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_step_backward(dim, func); @@ -2081,18 +2033,14 @@ namespace xt template inline void xview_stepper::step(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_step_forward(dim, n, func); } template inline void 
xview_stepper::step_back(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) - { + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_step_backward(dim, n, func); @@ -2101,20 +2049,14 @@ namespace xt template inline void xview_stepper::reset(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step_back(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_reset(dim, func, false); } template inline void xview_stepper::reset_back(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_reset(dim, func, true); } @@ -2142,12 +2084,10 @@ namespace xt template inline void xview_stepper::to_end_impl(layout_type l) { - auto func = [](const auto& s) noexcept - { + auto func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1); }; - auto size_func = [](const auto& s) noexcept - { + auto size_func = [](const auto& s) noexcept { return get_size(s); }; @@ -2197,15 +2137,14 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept - { + auto func = [&dim, this](const auto& s) noexcept { return step_size(s, this->m_index_keeper[dim]++, 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : 1; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2218,8 +2157,7 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept - { + auto func = [&dim, &n, this](const auto& s) noexcept { auto st_size = step_size(s, this->m_index_keeper[dim], n); this->m_index_keeper[dim] += n; return size_type(st_size); @@ -2228,8 +2166,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : n; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2242,16 +2180,15 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept - { + auto func = [&dim, this](const auto& s) noexcept { this->m_index_keeper[dim]--; return step_size(s, this->m_index_keeper[dim], 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : 1; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2264,8 +2201,7 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept - { + auto func = [&dim, &n, this](const auto& s) noexcept { this->m_index_keeper[dim] -= n; return step_size(s, this->m_index_keeper[dim], n); }; @@ -2273,8 +2209,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : n; + size_type step_size = index < sizeof...(S) ? 
+ apply(index, func, p_view->slices()) : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2285,27 +2221,19 @@ namespace xt template void xview_stepper::common_reset(size_type dim, F f, bool backwards) { - auto size_func = [](const auto& s) noexcept - { - return get_size(s); - }; - auto end_func = [](const auto& s) noexcept - { - return xt::value(s, get_size(s) - 1) - xt::value(s, 0); - }; + auto size_func = [](const auto& s) noexcept { return get_size(s); }; + auto end_func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1) - xt::value(s, 0); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { if (dim < m_index_keeper.size()) { - size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) - : p_view->shape()[dim]; + size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) : p_view->shape()[dim]; m_index_keeper[dim] = backwards ? size - 1 : 0; } - size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) - : p_view->shape()[dim] - 1; + size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) : p_view->shape()[dim] - 1; index -= newaxis_count_before(index); f(index, reset_n); } From 643b6016dca9019525ad6178305d89f004507d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 15:54:38 +0100 Subject: [PATCH 03/33] Fix openMP parallel linear assign --- include/xtensor/xassign.hpp | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 7c67da273..5ce72a6e5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -657,22 +657,22 @@ namespace xt } } #elif defined(XTENSOR_USE_OPENMP) - if (size >= XTENSOR_OPENMP_TRESHOLD) + if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) { - #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) - #ifndef _WIN32 +#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) +#ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } - #else +#else for(auto i = static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } - #endif +#endif } else { @@ -719,10 +719,21 @@ namespace xt *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + if (n >= XTENSOR_OPENMP_TRESHOLD) { - *(dst + i) = static_cast(*(src + i)); +#pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + { + *(dst + i) = static_cast(*(src + i)); + } + } + else { + for (; n > size_type(0); --n) + { + *dst = static_cast(*src); + ++src; + ++dst; + } } #else for (; n > size_type(0); --n) @@ -1022,7 +1033,7 @@ namespace xt } // std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + if (outer_loop_size>20) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = 0; ox < outer_loop_size; ++ox) From 8e75d92156308c7b67cc586c362dc481b79dc239 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 18:02:08 +0100 Subject: [PATCH 04/33] Fix typo on xassign.hpp --- include/xtensor/xassign.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5ce72a6e5..dd8d79713 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -657,7 +657,7 @@ namespace xt } } #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) + if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) #ifndef _WIN32 From 63381a8dba4678758f79b8b354db6aac30059617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 4 Feb 2021 13:09:55 +0100 Subject: [PATCH 05/33] initial thoughts parallel strided assign --- include/xtensor/xassign.hpp | 316 +++++++++++++++++++++++------------- 1 file changed, 205 insertions(+), 111 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index dd8d79713..fbf2dfcfd 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -247,7 +246,7 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); + (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); } template @@ -343,7 +342,7 @@ namespace xt template static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } - static constexpr bool simd_interface() { return has_simd_interface() && + static constexpr bool simd_interface() { return has_simd_interface() && has_simd_interface(); } public: @@ -395,7 +394,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign && de1.layout() == de2.layout()) + else if (simd_strided_assign) { strided_loop_assigner::run(de1, de2); } @@ -419,20 +418,16 @@ namespace xt inline void xexpression_assigner::computed_assign(xexpression& e1, const xexpression& e2) { using shape_type = typename E1::shape_type; - using comperator_type = std::greater; - using size_type = typename E1::size_type; E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); - size_type dim2 = de2.dimension(); - shape_type shape = uninitialized_shape(dim2); - + size_type dim = de2.dimension(); + shape_type shape = uninitialized_shape(dim); bool trivial_broadcast = de2.broadcast_shape(shape, true); - auto && de1_shape = de1.shape(); - if (dim2 > de1.dimension() || std::lexicographical_compare(shape.begin(), shape.end(), de1_shape.begin(), de1_shape.end(), comperator_type())) + if (dim > de1.dimension() || shape > de1.shape()) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -620,7 +615,6 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { -// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -642,37 +636,27 @@ namespace xt } #if defined(XTENSOR_USE_TBB) - if (size >= XTENSOR_TBB_THRESHOLD) - { - tbb::parallel_for(align_begin, align_end, simd_size, [&e1, 
&e2](size_t i) - { - e1.template store_simd(i, e2.template load_simd(i)); - }); - } - else + tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) { - for (size_type i = align_begin; i < align_end; i += simd_size) - { - e1.template store_simd(i, e2.template load_simd(i)); - } - } + e1.template store_simd(i, e2.template load_simd(i)); + }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) + if (size >= XTENSOR_OPENMP_TRESHOLD) { -#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) -#ifndef _WIN32 + #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) + #ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } -#else + #else for(auto i = static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } -#endif + #endif } else { @@ -712,28 +696,17 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); -// std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (n >= XTENSOR_OPENMP_TRESHOLD) + #pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) { -#pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) - { - *(dst + i) = static_cast(*(src + i)); - } - } - else { - for (; n > size_type(0); --n) - { - *dst = static_cast(*src); - ++src; - ++dst; - } + *(dst + i) = static_cast(*(src + i)); } #else for (; n > size_type(0); --n) @@ -783,29 +756,75 @@ namespace xt } } + template + static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) + { + auto i = outer_index.size()-1; + auto stride_sizes = full_like(outer_shape,0); + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] !=0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + if (i>0) { + outer_index[i-1]++; + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + else { + if (i>0) { + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + } + i++; + // add remaining iterations + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + template - static void nth_idx(size_t n, T& outer_index, const T& outer_shape) + static void nth_idx(size_t n, T& outer_index, T& outer_shape) { + // assume outer_index is filled with 0 + auto stride_sizes = full_like(outer_shape,0); - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); + auto i = outer_index.size()-1; + stride_sizes[i] = 1; + i-- // compute strides - using size_type = typename T::size_type; - for ( size_type i = outer_shape.size(); i>0; i--) { - stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; - } + for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } // compute index - for (size_type i=0; i < outer_shape.size(); i++) + for (i=0; i < outer_index.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] = d_idx; + outer_index[i] += d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen - outer_index[i] = 0; } } } @@ -835,27 +854,80 @@ namespace xt } } + template + static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) + { + using size_type = typename T::size_type; + size_type i = 0; + auto stride_sizes = outer_shape; + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] != 0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + + if (i+1= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + template - static void nth_idx(size_t n, T& outer_index, const T& outer_shape) + static void nth_idx(size_t n, T& outer_index, T& outer_shape) { - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - + // assume outer_index is filled with 0 using size_type = typename T::size_type; + auto stride_sizes = outer_shape; + size_type i = 0; + stride_sizes[i] = 1; + i++; // compute required strides - for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 
1 : stride_sizes[i-1] * outer_shape[i-1]; } + for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } // compute index - for (size_type i = outer_shape.size(); i>0;) - { - i--; + i = outer_shape.size()-1; + for (; i >= 0; i--) + { auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] = d_idx; + outer_index[i] += d_idx; n -= d_idx * stride_sizes[i]; } -// else {} // that may not happen + else { + // that may not happen + } } } }; @@ -924,13 +996,11 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - if (cut(e1.strides()); cut = csf(e2); - if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; @@ -950,6 +1020,7 @@ namespace xt } } +#define strided_row_major_fallback #define strided_parallel_assign template template @@ -957,10 +1028,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; + if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); - switch (dynamic_layout) { case layout_type::row_major: @@ -970,7 +1041,11 @@ namespace xt is_row_major = false; break; default: +#ifdef strided_row_major_fallback + is_row_major = true; +#else return fallback_assigner(e1, e2).run(); +#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1031,55 +1106,75 @@ namespace xt { step_dim = cut; } -// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <20) { - std::size_t first_step = true; + static bool reported = false; + if (!reported) { + std::cout << "SIMD size is " << simd_type::size << " step_dim " << step_dim << std::endl; + reported = true; + } + std::cout << "+"; + std::size_t first_step = true; +// #define old_parallel #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? 
+ strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + else { + // next unaligned index + if (is_row_major) { + auto i = max_shape.size()-1; +// fct_stepper.step_back(i+1,inner_loop_size); +// res_stepper.step_back(i+1,inner_loop_size); + fct_stepper.reset(i+1); + res_stepper.reset(i+1); + fct_stepper.step_back(i+1,1); + res_stepper.step_back(i+1,1); - for (std::size_t i = 0; i < idx.size(); ++i) + + for (; i >= 0; --i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + if (idx[i] + 1 >= max_shape[i]) + { + idx[i] = 0; + fct_stepper.reset(i); + res_stepper.reset(i); + } + else + { + idx[i]++; + fct_stepper.step(i, 1); + res_stepper.step(i, 1); + break; + } } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.template store_simd(fct_stepper.template step_simd()); } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + else { + std::cout << "oops" << std::endl; } +// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + } +#else for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1117,8 +1212,7 @@ namespace xt } } } -#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - } + #endif } From 01e156e94ede732a5f43229991b24d379eb1c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 27 May 2021 23:30:24 +0200 Subject: [PATCH 06/33] trigger strided assign properly --- include/xtensor/xassign.hpp | 260 ++++++++++-------------------------- 1 file changed, 70 insertions(+), 190 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index fbf2dfcfd..99797bcb7 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -246,7 +246,7 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); + (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -394,7 +394,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) + else if (simd_strided_assign && de1.layout() == de2.layout()) { strided_loop_assigner::run(de1, de2); } @@ -615,6 +615,7 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { +// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = 
std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -696,7 +697,7 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); - +// std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { @@ -756,75 +757,29 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) - { - auto i = outer_index.size()-1; - auto stride_sizes = full_like(outer_shape,0); - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i >= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] !=0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - if (i>0) { - outer_index[i-1]++; - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - else { - if (i>0) { - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - } - i++; - // add remaining iterations - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 - auto stride_sizes = full_like(outer_shape,0); - auto i = outer_index.size()-1; - stride_sizes[i] = 1; - i-- + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); // compute strides - for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + using size_type = typename T::size_type; + for ( size_type i = outer_shape.size(); i>0; i--) { + stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } // compute index - for (i=0; i < outer_index.size(); i++) + for (size_type i=0; i < outer_shape.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen + outer_index[i] = 0; } } } @@ -854,80 +809,27 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) - { - using size_type = typename T::size_type; - size_type i = 0; - auto stride_sizes = outer_shape; - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] != 0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - - if (i+1= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + using size_type = typename T::size_type; - auto stride_sizes = outer_shape; - size_type i = 0; - stride_sizes[i] = 1; - i++; // compute required strides - for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } // compute index - i = outer_shape.size()-1; - for (; i >= 0; i--) - { + for (size_type i = outer_shape.size(); i>0;) + { + i--; auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } - else { - // that may not happen - } +// else {} // that may not happen } } }; @@ -996,11 +898,13 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + if (cut(e1.strides()); cut = csf(e2); + if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; @@ -1020,7 +924,6 @@ namespace xt } } -#define strided_row_major_fallback #define strided_parallel_assign template template @@ -1028,10 +931,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); + switch (dynamic_layout) { case layout_type::row_major: @@ -1041,11 +944,7 @@ namespace xt is_row_major = false; break; default: -#ifdef strided_row_major_fallback - is_row_major = true; -#else return fallback_assigner(e1, e2).run(); -#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1106,75 +1005,55 @@ namespace xt { step_dim = cut; } +// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? 
- strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - else { - // next unaligned index - if (is_row_major) { - auto i = max_shape.size()-1; -// fct_stepper.step_back(i+1,inner_loop_size); -// res_stepper.step_back(i+1,inner_loop_size); - fct_stepper.reset(i+1); - res_stepper.reset(i+1); - fct_stepper.step_back(i+1,1); - res_stepper.step_back(i+1,1); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (; i >= 0; --i) + for (std::size_t i = 0; i < idx.size(); ++i) { - if (idx[i] + 1 >= max_shape[i]) - { - idx[i] = 0; - fct_stepper.reset(i); - res_stepper.reset(i); - } - else - { - idx[i]++; - fct_stepper.step(i, 1); - res_stepper.step(i, 1); - break; - } + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } + first_step = false; } - else { - std::cout << "oops" << std::endl; + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } -// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); } - } -#else + } + else { + +#endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1212,7 +1091,8 @@ namespace xt } } } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + } #endif } From da29eda99c086a83dbedba35a7d10ac4011137ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 15:54:38 +0100 Subject: [PATCH 07/33] Fix openMP parallel linear assign --- include/xtensor/xassign.hpp | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 99797bcb7..3e1f0af9e 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -642,22 +642,22 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= XTENSOR_OPENMP_TRESHOLD) + if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) { - #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) - #ifndef _WIN32 +#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) +#ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } - #else +#else for(auto i = 
static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } - #endif +#endif } else { @@ -704,10 +704,21 @@ namespace xt *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + if (n >= XTENSOR_OPENMP_TRESHOLD) { - *(dst + i) = static_cast(*(src + i)); +#pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + { + *(dst + i) = static_cast(*(src + i)); + } + } + else { + for (; n > size_type(0); --n) + { + *dst = static_cast(*src); + ++src; + ++dst; + } } #else for (; n > size_type(0); --n) @@ -1007,7 +1018,7 @@ namespace xt } // std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + if (outer_loop_size>20) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = 0; ox < outer_loop_size; ++ox) From 1531f81d5ba318f96367c9e7b7d8862ca80536c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 18:02:08 +0100 Subject: [PATCH 08/33] Fix typo on xassign.hpp --- include/xtensor/xassign.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 3e1f0af9e..87dc9e3c5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -642,7 +642,7 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) + if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) #ifndef _WIN32 From 198b3852c7289124a35f2cd91ef2df345d6e330d Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 02:55:20 +0100 Subject: [PATCH 09/33] Fix wrong merge --- include/xtensor/xview.hpp | 624 ++++++++++++++++++++++---------------- 1 file changed, 356 insertions(+), 268 deletions(-) diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 931bb67a9..52b7c33c0 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -1,11 +1,11 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * -* Copyright (c) QuantStack * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XTENSOR_VIEW_HPP #define XTENSOR_VIEW_HPP @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include "xaccessible.hpp" @@ -34,7 +34,6 @@ #include "xtensor_forward.hpp" #include "xview_utils.hpp" - namespace xt { @@ -76,26 +75,22 @@ namespace xt { template - struct is_xrange - : std::false_type + struct is_xrange : std::false_type { }; template - struct is_xrange> - : std::true_type + struct is_xrange> : std::true_type { }; template - struct is_xall_slice - : std::false_type + struct is_xall_slice : std::false_type { }; template - struct is_xall_slice> - : std::true_type + struct is_xall_slice> : std::true_type { }; @@ -134,32 +129,33 @@ namespace xt template struct is_xscalar_impl> { - static constexpr bool value = static_cast(integral_count()) == static_dimension::shape_type>::value ? true : false; + static constexpr bool value = static_cast(integral_count() + ) == static_dimension::shape_type>::value + ? true + : false; }; template - struct is_strided_slice_impl - : std::true_type + struct is_strided_slice_impl : std::true_type { }; template - struct is_strided_slice_impl> - : std::false_type + struct is_strided_slice_impl> : std::false_type { }; template - struct is_strided_slice_impl> - : std::false_type + struct is_strided_slice_impl> : std::false_type { }; // If we have no discontiguous slices, we can calculate strides for this view. template struct is_strided_view - : std::integral_constant, - is_strided_slice_impl>...>::value> + : std::integral_constant< + bool, + xtl::conjunction, is_strided_slice_impl>...>::value> { }; @@ -183,7 +179,10 @@ namespace xt static constexpr bool have_all_seen = all_seen || is_all_slice; static constexpr bool have_range_seen = is_range_slice; - static constexpr bool is_valid = valid && (have_all_seen ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); + static constexpr bool is_valid = valid + && (have_all_seen + ? is_all_slice + : (!range_seen && (is_int_slice || is_range_slice))); static constexpr bool value = is_contiguous_view_impl>::value; + static constexpr bool is_valid = valid + && (have_int_seen + ? is_int_slice + : (!range_seen && (is_all_slice || is_range_slice))); + static constexpr bool value = is_contiguous_view_impl < layout_type::column_major, is_valid, + have_int_seen, is_range_slice || range_seen, + xtl::mpl::pop_front_t < V >> ::value; }; template @@ -234,11 +234,14 @@ namespace xt // TODO relax has_data_interface constraint here! 
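// --- Standalone sketch, not part of the patch above: the recursion in
// is_contiguous_view_impl boils down to the rule that, for a row-major
// expression, a view stays contiguous when its slices match the pattern
//     integral* range? all*
// i.e. leading integral indices, at most one contiguous range, then only
// full ("all") slices. The names below (slice_kind, row_major_contiguous)
// are illustrative and do not exist in xtensor.

#include <cstddef>
#include <initializer_list>

enum class slice_kind { integral, range, all };

constexpr bool row_major_contiguous(std::initializer_list<slice_kind> slices)
{
    bool all_seen = false;
    bool range_seen = false;
    for (slice_kind s : slices)
    {
        if (all_seen)
        {
            if (s != slice_kind::all)
            {
                return false;   // after the first full slice, only full slices may follow
            }
        }
        else if (s == slice_kind::all)
        {
            all_seen = true;
        }
        else if (range_seen)
        {
            return false;       // a range may only be followed by full slices
        }
        else if (s == slice_kind::range)
        {
            range_seen = true;
        }
        // integral slices before any range or full slice keep the view contiguous
    }
    return true;
}

// view(e, 1, range(2, 5), all()) stays contiguous on a row-major expression:
static_assert(row_major_contiguous({slice_kind::integral, slice_kind::range, slice_kind::all}), "");
// view(e, all(), 3) does not:
static_assert(!row_major_contiguous({slice_kind::all, slice_kind::integral}), "");
// --- end of sketch ---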
template struct is_contiguous_view - : std::integral_constant::value && - !(E::static_layout == layout_type::column_major && static_cast(static_dimension::value) != sizeof...(S)) && - is_contiguous_view_impl>::value - > + : std::integral_constant< + bool, + has_data_interface::value + && !( + E::static_layout == layout_type::column_major + && static_cast(static_dimension::value) != sizeof...(S) + ) + && is_contiguous_view_impl>::value> { }; @@ -276,21 +279,20 @@ namespace xt struct get_contigous_shape_type { // if we have no `range` in the slices we can re-use the shape with an offset - using type = std::conditional_t...>::value, - typename xview_shape_type::type, - // In the false branch we know that we have only integers at the front OR end, and NO range - typename unwrap_offset_container()>::type>; + using type = std::conditional_t< + xtl::disjunction...>::value, + typename xview_shape_type::type, + // In the false branch we know that we have only integers at the front OR end, and NO range + typename unwrap_offset_container()>::type>; }; template - struct is_sequence_view - : std::integral_constant + struct is_sequence_view : std::integral_constant { }; template - struct is_sequence_view> - : std::integral_constant + struct is_sequence_view> : std::integral_constant { }; } @@ -304,15 +306,16 @@ namespace xt using size_type = typename xexpression_type::size_type; using temporary_type = view_temporary_type_t; - static constexpr layout_type layout = - detail::is_contiguous_view::value ? - xexpression_type::static_layout : layout_type::dynamic; + static constexpr layout_type layout = detail::is_contiguous_view::value + ? xexpression_type::static_layout + : layout_type::dynamic; static constexpr bool is_const = std::is_const>::value; - using extract_storage_type = xtl::mpl::eval_if_t, - detail::expr_storage_type, - make_invalid_type<>>; + using extract_storage_type = xtl::mpl::eval_if_t< + has_data_interface, + detail::expr_storage_type, + make_invalid_type<>>; using storage_type = std::conditional_t; }; @@ -324,17 +327,20 @@ namespace xt static constexpr bool is_strided_view = detail::is_strided_view::value; static constexpr bool is_contiguous_view = detail::is_contiguous_view::value; - using inner_shape_type = std::conditional_t::type, - typename xview_shape_type::type>; + using inner_shape_type = std::conditional_t< + is_contiguous_view, + typename detail::get_contigous_shape_type::type, + typename xview_shape_type::type>; - using stepper = std::conditional_t>, - xview_stepper>::value, CT, S...>>; + using stepper = std::conditional_t< + is_strided_view, + xstepper>, + xview_stepper>::value, CT, S...>>; - using const_stepper = std::conditional_t>, - xview_stepper, S...>>; + using const_stepper = std::conditional_t< + is_strided_view, + xstepper>, + xview_stepper, S...>>; }; /** @@ -354,9 +360,9 @@ namespace xt template class xview : public xview_semantic>, public std::conditional_t< - detail::is_contiguous_view, S...>::value, - xcontiguous_iterable>, - xiterable>>, + detail::is_contiguous_view, S...>::value, + xcontiguous_iterable>, + xiterable>>, public xaccessible>, public extension::xview_base_t { @@ -378,9 +384,8 @@ namespace xt using bool_load_type = typename xexpression_type::bool_load_type; using reference = typename inner_types::reference; using const_reference = typename inner_types::const_reference; - using pointer = std::conditional_t; + using pointer = std:: + conditional_t; using const_pointer = typename xexpression_type::const_pointer; using size_type = typename 
inner_types::size_type; using difference_type = typename xexpression_type::difference_type; @@ -395,28 +400,35 @@ namespace xt using inner_shape_type = typename iterable_base::inner_shape_type; using shape_type = typename xview_shape_type::type; - using xexpression_inner_strides_type = xtl::mpl::eval_if_t, - detail::expr_inner_strides_type, - get_strides_type>; + using xexpression_inner_strides_type = xtl::mpl::eval_if_t< + has_strides, + detail::expr_inner_strides_type, + get_strides_type>; - using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t, - detail::expr_inner_backstrides_type, - get_strides_type>; + using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t< + has_strides, + detail::expr_inner_backstrides_type, + get_strides_type>; using storage_type = typename inner_types::storage_type; - static constexpr bool has_trivial_strides = is_contiguous_view && !xtl::disjunction...>::value; - using inner_strides_type = std::conditional_t()>::type, - get_strides_t>; - - using inner_backstrides_type = std::conditional_t()>::type, - get_strides_t>; + static constexpr bool has_trivial_strides = is_contiguous_view + && !xtl::disjunction...>::value; + using inner_strides_type = std::conditional_t< + has_trivial_strides, + typename detail::unwrap_offset_container< + xexpression_type::static_layout, + xexpression_inner_strides_type, + integral_count()>::type, + get_strides_t>; + + using inner_backstrides_type = std::conditional_t< + has_trivial_strides, + typename detail::unwrap_offset_container< + xexpression_type::static_layout, + xexpression_inner_backstrides_type, + integral_count()>::type, + get_strides_t>; using strides_type = get_strides_t; using backstrides_type = strides_type; @@ -427,21 +439,21 @@ namespace xt using stepper = typename iterable_base::stepper; using const_stepper = typename iterable_base::const_stepper; - using linear_iterator = std::conditional_t::value && is_strided_view, - std::conditional_t, - typename iterable_base::linear_iterator>; - using const_linear_iterator = std::conditional_t::value && is_strided_view, - typename xexpression_type::const_linear_iterator, - typename iterable_base::const_linear_iterator>; + using linear_iterator = std::conditional_t< + has_data_interface::value && is_strided_view, + std::conditional_t, + typename iterable_base::linear_iterator>; + using const_linear_iterator = std::conditional_t< + has_data_interface::value && is_strided_view, + typename xexpression_type::const_linear_iterator, + typename iterable_base::const_linear_iterator>; using reverse_linear_iterator = std::reverse_iterator; using const_reverse_linear_iterator = std::reverse_iterator; using container_iterator = pointer; using const_container_iterator = const_pointer; - constexpr static std::size_t rank = SIZE_MAX; + static constexpr std::size_t rank = SIZE_MAX; // The FSL argument prevents the compiler from calling this constructor // instead of the copy constructor when sizeof...(SL) == 0. 
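// --- Standalone illustration, not taken from the patch: in the
// has_trivial_strides case above (contiguous view, no range slices), the
// parent expression's strides can be reused unchanged and only a constant
// data offset differs. leading_offset below is a hypothetical helper used
// for this example only; it forms the dot product of the dropped leading
// indices with the matching parent strides.

#include <cstddef>
#include <numeric>
#include <vector>

inline std::ptrdiff_t leading_offset(const std::vector<std::ptrdiff_t>& parent_strides,
                                     const std::vector<std::ptrdiff_t>& leading_indices)
{
    // offset of view(e, i0, ..., ik, all(), ...) relative to e.data()
    return std::inner_product(leading_indices.begin(), leading_indices.end(),
                              parent_strides.begin(), std::ptrdiff_t(0));
}

// Example: a 4x5x6 row-major tensor has strides {30, 6, 1};
// view(e, 2, 3) starts at offset 2*30 + 3*6 = 78 and keeps stride {1} for its
// remaining axis.
// --- end of sketch ---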
@@ -490,48 +502,36 @@ namespace xt bool has_linear_assign(const ST& strides) const; template - std::enable_if_t - stepper_begin(const ST& shape); + std::enable_if_t stepper_begin(const ST& shape); template - std::enable_if_t - stepper_end(const ST& shape, layout_type l); + std::enable_if_t stepper_end(const ST& shape, layout_type l); template - std::enable_if_t - stepper_begin(const ST& shape) const; + std::enable_if_t stepper_begin(const ST& shape) const; template - std::enable_if_t - stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t - stepper_begin(const ST& shape); + std::enable_if_t stepper_begin(const ST& shape); template - std::enable_if_t - stepper_end(const ST& shape, layout_type l); + std::enable_if_t stepper_end(const ST& shape, layout_type l); template - std::enable_if_t - stepper_begin(const ST& shape) const; + std::enable_if_t stepper_begin(const ST& shape) const; template - std::enable_if_t - stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t::value, storage_type&> - storage(); + std::enable_if_t::value, storage_type&> storage(); template - std::enable_if_t::value, const storage_type&> - storage() const; + std::enable_if_t::value, const storage_type&> storage() const; template - std::enable_if_t::value && is_strided_view, linear_iterator> - linear_begin(); + std::enable_if_t::value && is_strided_view, linear_iterator> linear_begin(); template - std::enable_if_t::value && is_strided_view, linear_iterator> - linear_end(); + std::enable_if_t::value && is_strided_view, linear_iterator> linear_end(); template std::enable_if_t::value && is_strided_view, const_linear_iterator> @@ -582,12 +582,10 @@ namespace xt backstrides() const; template - std::enable_if_t::value && is_strided_view, const_pointer> - data() const; + std::enable_if_t::value && is_strided_view, const_pointer> data() const; template - std::enable_if_t::value && is_strided_view, pointer> - data(); + std::enable_if_t::value && is_strided_view, pointer> data(); template std::enable_if_t::value && is_strided_view, std::size_t> @@ -620,11 +618,13 @@ namespace xt size_type underlying_size(size_type dim) const; xtl::xclosure_pointer operator&() &; - xtl::xclosure_pointer operator&() const &; + xtl::xclosure_pointer operator&() const&; xtl::xclosure_pointer operator&() &&; - template ::value && is_contiguous_view, int>> + template < + class E, + class T = xexpression_type, + class = std::enable_if_t::value && is_contiguous_view, int>> void assign_to(xexpression& e, bool force_resize) const; template @@ -646,9 +646,11 @@ namespace xt template enable_simd_interface store_simd(size_type i, const simd& e); - template ::size, - class T = xexpression_type> + template < + class align, + class requested_type = value_type, + std::size_t N = xt_simd::simd_traits::size, + class T = xexpression_type> enable_simd_interface> load_simd(size_type i) const; template @@ -789,9 +791,7 @@ namespace xt { public: - using view_type = std::conditional_t, - xview>; + using view_type = std::conditional_t, xview>; using substepper_type = get_stepper; using value_type = typename substepper_type::value_type; @@ -803,8 +803,13 @@ namespace xt using shape_type = typename substepper_type::shape_type; xview_stepper() = default; - xview_stepper(view_type* view, substepper_type it, - size_type offset, bool end = false, layout_type l = XTENSOR_DEFAULT_TRAVERSAL); + 
xview_stepper( + view_type* view, + substepper_type it, + size_type offset, + bool end = false, + layout_type l = XTENSOR_DEFAULT_TRAVERSAL + ); reference operator*() const; @@ -887,7 +892,7 @@ namespace xt std::forward(e), std::forward(first_slice), std::forward(slices)... - ) + ) { } @@ -895,25 +900,26 @@ namespace xt template template xview::xview(std::true_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)), - m_slices(std::forward(first_slice), std::forward(slices)...), - m_shape(compute_shape(detail::is_sequence_view{})), - m_strides(m_e.strides()), - m_backstrides(m_e.backstrides()), - m_data_offset(data_offset_impl(std::make_index_sequence())), - m_strides_computed(true) + : m_e(std::forward(e)) + , m_slices(std::forward(first_slice), std::forward(slices)...) + , m_shape(compute_shape(detail::is_sequence_view{})) + , m_strides(m_e.strides()) + , m_backstrides(m_e.backstrides()) + , m_data_offset(data_offset_impl(std::make_index_sequence())) + , m_strides_computed(true) { } template template xview::xview(std::false_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)), - m_slices(std::forward(first_slice), std::forward(slices)...), - m_shape(compute_shape(std::false_type{})), - m_strides_computed(false) + : m_e(std::forward(e)) + , m_slices(std::forward(first_slice), std::forward(slices)...) + , m_shape(compute_shape(std::false_type{})) + , m_strides_computed(false) { } + //@} template @@ -936,6 +942,7 @@ namespace xt { return semantic_base::operator=(e); } + //@} template @@ -974,13 +981,8 @@ namespace xt template inline layout_type xview::layout() const noexcept { - return xtl::mpl::static_if([&](auto self) - { - if (static_layout != layout_type::dynamic) - { - return static_layout; - } - else + return xtl::mpl::static_if( + [&](auto self) { return self(this)->m_e.layout()==layout_type::row_major ? ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : @@ -988,8 +990,7 @@ namespace xt bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; - } - }, + }, /* else */ [&](auto /*self*/) { std::cout << "View is not strided!" 
<< std::endl; @@ -1035,13 +1036,17 @@ namespace xt template inline void xview::fill(const T& value) { - xtl::mpl::static_if([&](auto self) - { - std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); - }, /*else*/ [&](auto self) - { - std::fill(self(this)->begin(), self(this)->end(), value); - }); + xtl::mpl::static_if( + [&](auto self) + { + std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); + }, + /*else*/ + [&](auto self) + { + std::fill(self(this)->begin(), self(this)->end(), value); + } + ); } /** @@ -1087,7 +1092,6 @@ namespace xt return unchecked_impl(make_index_sequence(args...), static_cast(args)...); } - template template inline auto xview::element(It first, It last) -> reference @@ -1175,16 +1179,15 @@ namespace xt */ template template - inline auto xview::storage() -> - std::enable_if_t::value, storage_type&> + inline auto xview::storage() -> std::enable_if_t::value, storage_type&> { return m_e.storage(); } template template - inline auto xview::storage() const -> - std::enable_if_t::value, const storage_type&> + inline auto xview::storage() const + -> std::enable_if_t::value, const storage_type&> { return m_e.storage(); } @@ -1290,8 +1293,8 @@ namespace xt */ template template - inline auto xview::strides() const -> - std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::strides() const + -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1303,8 +1306,8 @@ namespace xt template template - inline auto xview::backstrides() const -> - std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::backstrides() const + -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1319,16 +1322,16 @@ namespace xt */ template template - inline auto xview::data() const -> - std::enable_if_t::value && is_strided_view, const_pointer> + inline auto xview::data() const + -> std::enable_if_t::value && is_strided_view, const_pointer> { return m_e.data(); } template template - inline auto xview::data() -> - std::enable_if_t::value && is_strided_view, pointer> + inline auto xview::data() + -> std::enable_if_t::value && is_strided_view, pointer> { return m_e.data(); } @@ -1337,9 +1340,9 @@ namespace xt template inline std::size_t xview::data_offset_impl(std::index_sequence) const noexcept { - auto temp = std::array({ - (static_cast(xt::value(std::get(m_slices), 0)))... 
- }); + auto temp = std::array( + {(static_cast(xt::value(std::get(m_slices), 0)))...} + ); std::ptrdiff_t result = 0; std::size_t i = 0; @@ -1359,8 +1362,8 @@ namespace xt */ template template - inline auto xview::data_offset() const noexcept -> - std::enable_if_t::value && is_strided_view, std::size_t> + inline auto xview::data_offset() const noexcept + -> std::enable_if_t::value && is_strided_view, std::size_t> { if (!m_strides_computed) { @@ -1369,6 +1372,7 @@ namespace xt } return m_data_offset; } + //@} template @@ -1384,7 +1388,7 @@ namespace xt } template - inline auto xview::operator&() const & -> xtl::xclosure_pointer + inline auto xview::operator&() const& -> xtl::xclosure_pointer { return xtl::closure_pointer(*this); } @@ -1421,15 +1425,20 @@ namespace xt template inline bool xview::has_linear_assign(const ST& str) const { - return xtl::mpl::static_if([&](auto self) - { - return str.size() == self(this)->strides().size() && - std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); - - }, /*else*/ [](auto /*self*/){ - return false; - }); + return xtl::mpl::static_if( + [&](auto self) + { + return str.size() == self(this)->strides().size() + && std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); + }, + /*else*/ + [](auto /*self*/) + { + return false; + } + ); } + //@} template @@ -1465,7 +1474,8 @@ namespace xt } template - inline auto xview::data_xend(layout_type l, size_type offset) const noexcept -> const_container_iterator + inline auto xview::data_xend(layout_type l, size_type offset) const noexcept + -> const_container_iterator { return data_xend_impl(data() + data_offset(), l, offset); } @@ -1503,7 +1513,8 @@ namespace xt template template - inline auto xview::load_simd(size_type i) const -> enable_simd_interface> + inline auto xview::load_simd(size_type i) const + -> enable_simd_interface> { return m_e.template load_simd(data_offset() + i); } @@ -1542,9 +1553,10 @@ namespace xt template inline auto xview::make_index_sequence(Args...) const noexcept { - return std::make_index_sequence<(sizeof...(Args)+integral_count() > newaxis_count() ? - sizeof...(Args)+integral_count() - newaxis_count() : - 0)>(); + return std::make_index_sequence< + (sizeof...(Args) + integral_count() > newaxis_count() + ? sizeof...(Args) + integral_count() - newaxis_count() + : 0)>(); } template @@ -1552,11 +1564,12 @@ namespace xt inline auto xview::compute_strides_impl(std::index_sequence) const noexcept { std::size_t original_dim = m_e.dimension(); - return std::array({ - (static_cast(xt::step_size(std::get(I)>(m_slices), 1)) * - ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim ? - m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] : 1))... - }); + return std::array( + {(static_cast(xt::step_size(std::get(I)>(m_slices), 1)) + * ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim + ? m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] + : 1))...} + ); } template @@ -1632,7 +1645,8 @@ namespace xt template template ::size_type... I, class... Args> - inline auto xview::unchecked_impl(std::index_sequence, Args... args) const -> const_reference + inline auto xview::unchecked_impl(std::index_sequence, Args... args) const + -> const_reference { return m_e.unchecked(index(args...)...); } @@ -1653,15 +1667,19 @@ namespace xt template template ::size_type I, class... Args> - inline auto xview::index(Args... 
args) const -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const + -> std::enable_if_t::value, size_type> { - return sliced_access(I) + newaxis_count_before(I + 1)> - (std::get(I + 1)>(m_slices), args...); + return sliced_access(I) + newaxis_count_before(I + 1)>( + std::get(I + 1)>(m_slices), + args... + ); } template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const + -> std::enable_if_t::value, size_type> { return argument() + newaxis_count()>(args...); } @@ -1678,7 +1696,8 @@ namespace xt inline auto xview::sliced_access(const xslice& slice, Arg arg, Args... args) const -> size_type { using ST = typename T::size_type; - return static_cast(slice.derived_cast()(argument(static_cast(arg), static_cast(args)...))); + return static_cast(slice.derived_cast( + )(argument(static_cast(arg), static_cast(args)...))); } template @@ -1695,14 +1714,17 @@ namespace xt auto index = xtl::make_sequence(m_e.dimension(), 0); using diff_type = typename std::iterator_traits::difference_type; using ivalue_type = typename base_index_type::value_type; - auto func1 = [&first](const auto& s) noexcept { + auto func1 = [&first](const auto& s) noexcept + { return get_slice_value(s, first); }; - auto func2 = [](const auto& s) noexcept { + auto func2 = [](const auto& s) noexcept + { return xt::value(s, 0); }; - auto s = static_cast((std::min)(static_cast(std::distance(first, last)), this->dimension())); + auto s = static_cast((std::min + )(static_cast(std::distance(first, last)), this->dimension())); auto first_copy = last - s; for (size_type i = 0; i != m_e.dimension(); ++i) { @@ -1714,13 +1736,12 @@ namespace xt if (first < last) { - index[i] = k < sizeof...(S) ? - apply(k, func1, m_slices) : static_cast(*first); + index[i] = k < sizeof...(S) ? apply(k, func1, m_slices) + : static_cast(*first); } else { - index[i] = k < sizeof...(S) ? - apply(k, func2, m_slices) : ivalue_type(0); + index[i] = k < sizeof...(S) ? apply(k, func2, m_slices) : ivalue_type(0); } } return index; @@ -1737,12 +1758,15 @@ namespace xt { std::size_t dim = m_e.dimension() - integral_count() + newaxis_count(); auto shape = xtl::make_sequence(dim, 0); - auto func = [](const auto& s) noexcept { return get_size(s); }; + auto func = [](const auto& s) noexcept + { + return get_size(s); + }; for (size_type i = 0; i != dim; ++i) { size_type index = integral_skip(i); - shape[i] = index < sizeof...(S) ? - apply(index, func, m_slices) : m_e.shape()[index - newaxis_count_before(index)]; + shape[i] = index < sizeof...(S) ? 
apply(index, func, m_slices) + : m_e.shape()[index - newaxis_count_before(index)]; } return shape; } @@ -1756,7 +1780,8 @@ namespace xt } template - inline void run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) + inline void + run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) { std::copy(t.cbegin(), t.cend(), v.begin()); } @@ -1765,8 +1790,8 @@ namespace xt template inline void xview::assign_temporary_impl(temporary_type&& tmp) { - constexpr bool fast_assign = detail::is_strided_view::value && \ - xassign_traits, temporary_type>::simd_strided_assign(); + constexpr bool fast_assign = detail::is_strided_view::value + && xassign_traits, temporary_type>::simd_strided_assign(); xview_detail::run_assign_temporary_impl(*this, tmp, std::integral_constant{}); } @@ -1799,8 +1824,13 @@ namespace xt { // Checks that no ellipsis slice is used using view_type = xview, get_slice_type, S>...>; - return view_type(std::forward(e), - get_slice_implementation(e, std::forward(slices), get_underlying_shape_index, S...>(I))... + return view_type( + std::forward(e), + get_slice_implementation( + e, + std::forward(slices), + get_underlying_shape_index, S...>(I) + )... ); } } @@ -1817,7 +1847,11 @@ namespace xt template inline auto view(E&& e, S&&... slices) { - return detail::make_view_impl(std::forward(e), std::make_index_sequence(), std::forward(slices)...); + return detail::make_view_impl( + std::forward(e), + std::make_index_sequence(), + std::forward(slices)... + ); } namespace detail @@ -1825,8 +1859,9 @@ namespace xt class row_impl { public: - template - static inline auto make(E&& e, const std::ptrdiff_t index) + + template + inline static auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1834,17 +1869,21 @@ namespace xt } private: - template - static inline void check_dimension(const S& shape) + + template + inline static void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW(std::invalid_argument, "A row can only be accessed on an expression with exact two dimensions"); + XTENSOR_THROW( + std::invalid_argument, + "A row can only be accessed on an expression with exact two dimensions" + ); } } - template - static inline void check_dimension(const std::array&) + template + inline static void check_dimension(const std::array&) { static_assert(N == 2, "A row can only be accessed on an expression with exact two dimensions"); } @@ -1853,8 +1892,9 @@ namespace xt class column_impl { public: - template - static inline auto make(E&& e, const std::ptrdiff_t index) + + template + inline static auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1862,17 +1902,21 @@ namespace xt } private: - template - static inline void check_dimension(const S& shape) + + template + inline static void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW(std::invalid_argument, "A column can only be accessed on an expression with exact two dimensions"); + XTENSOR_THROW( + std::invalid_argument, + "A column can only be accessed on an expression with exact two dimensions" + ); } } - template - static inline void check_dimension(const std::array&) + template + inline static void check_dimension(const std::array&) { static_assert(N == 2, "A column can only be accessed on an expression with exact two dimensions"); } @@ -1923,7 +1967,8 @@ namespace xt template template - inline auto 
xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, m_e.stepper_end(m_e.shape(), l), offset, true, l); @@ -1931,7 +1976,8 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1940,7 +1986,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1957,7 +2004,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, data_xend(l, offset), offset); @@ -1965,7 +2013,8 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xbegin(), offset); @@ -1973,7 +2022,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const-> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xend(l, offset), offset); @@ -1984,14 +2034,24 @@ namespace xt ********************************/ template - inline xview_stepper::xview_stepper(view_type* view, substepper_type it, - size_type offset, bool end, layout_type l) - : p_view(view), m_it(it), m_offset(offset) + inline xview_stepper::xview_stepper( + view_type* view, + substepper_type it, + size_type offset, + bool end, + layout_type l + ) + : p_view(view) + , m_it(it) + , m_offset(offset) { if (!end) { std::fill(m_index_keeper.begin(), m_index_keeper.end(), 0); - auto func = [](const auto& s) noexcept { return xt::value(s, 0); }; + auto func = [](const auto& s) noexcept + { + return xt::value(s, 0); + }; for (size_type i = 0; i < sizeof...(S); ++i) { if (!is_newaxis_slice(i)) @@ -2017,14 +2077,18 @@ namespace xt template inline void xview_stepper::step(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_step_forward(dim, func); } template inline void xview_stepper::step_back(size_type dim) { - auto func = [this](size_type index, size_type offset) { + auto func = [this](size_type index, size_type offset) + { m_it.step_back(index, offset); }; common_step_backward(dim, func); @@ -2033,14 +2097,18 @@ namespace xt template inline void xview_stepper::step(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_step_forward(dim, n, func); } template inline void 
xview_stepper::step_back(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) { + auto func = [this](size_type index, size_type offset) + { m_it.step_back(index, offset); }; common_step_backward(dim, n, func); @@ -2049,14 +2117,20 @@ namespace xt template inline void xview_stepper::reset(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step_back(index, offset); + }; common_reset(dim, func, false); } template inline void xview_stepper::reset_back(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_reset(dim, func, true); } @@ -2084,10 +2158,12 @@ namespace xt template inline void xview_stepper::to_end_impl(layout_type l) { - auto func = [](const auto& s) noexcept { + auto func = [](const auto& s) noexcept + { return xt::value(s, get_size(s) - 1); }; - auto size_func = [](const auto& s) noexcept { + auto size_func = [](const auto& s) noexcept + { return get_size(s); }; @@ -2137,14 +2213,15 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept { + auto func = [&dim, this](const auto& s) noexcept + { return step_size(s, this->m_index_keeper[dim]++, 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : 1; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2157,7 +2234,8 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept { + auto func = [&dim, &n, this](const auto& s) noexcept + { auto st_size = step_size(s, this->m_index_keeper[dim], n); this->m_index_keeper[dim] += n; return size_type(st_size); @@ -2166,8 +2244,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : n; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2180,15 +2258,16 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept { + auto func = [&dim, this](const auto& s) noexcept + { this->m_index_keeper[dim]--; return step_size(s, this->m_index_keeper[dim], 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : 1; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2201,7 +2280,8 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept { + auto func = [&dim, &n, this](const auto& s) noexcept + { this->m_index_keeper[dim] -= n; return step_size(s, this->m_index_keeper[dim], n); }; @@ -2209,8 +2289,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : n; + size_type step_size = index < sizeof...(S) ? 
apply(index, func, p_view->slices()) + : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2221,19 +2301,27 @@ namespace xt template void xview_stepper::common_reset(size_type dim, F f, bool backwards) { - auto size_func = [](const auto& s) noexcept { return get_size(s); }; - auto end_func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1) - xt::value(s, 0); }; + auto size_func = [](const auto& s) noexcept + { + return get_size(s); + }; + auto end_func = [](const auto& s) noexcept + { + return xt::value(s, get_size(s) - 1) - xt::value(s, 0); + }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { if (dim < m_index_keeper.size()) { - size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) : p_view->shape()[dim]; + size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) + : p_view->shape()[dim]; m_index_keeper[dim] = backwards ? size - 1 : 0; } - size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) : p_view->shape()[dim] - 1; + size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) + : p_view->shape()[dim] - 1; index -= newaxis_count_before(index); f(index, reset_n); } From aa9d078e2a53c3d00aa0fd2a88748ff6fca63dd3 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 11:04:52 +0100 Subject: [PATCH 10/33] Try at fixed strided loop assignment --- include/xtensor/xassign.hpp | 395 ++++++++++++++++++++++++------------ 1 file changed, 269 insertions(+), 126 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 87dc9e3c5..5b7e513ac 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1,16 +1,17 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * -* Copyright (c) QuantStack * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XTENSOR_ASSIGN_HPP #define XTENSOR_ASSIGN_HPP #include +#include #include #include @@ -18,12 +19,12 @@ #include #include "xexpression.hpp" +#include "xfunction.hpp" #include "xiterator.hpp" #include "xstrides.hpp" #include "xtensor_config.hpp" #include "xtensor_forward.hpp" #include "xutils.hpp" -#include "xfunction.hpp" #if defined(XTENSOR_USE_TBB) #include @@ -99,7 +100,6 @@ namespace xt template static bool resize(E1& e1, const xfunction& e2); - }; /******************** @@ -195,14 +195,18 @@ namespace xt template inline void assign_xexpression(xexpression& e1, const xexpression& e2) { - xtl::mpl::static_if::value>([&](auto self) - { - self(e2).derived_cast().assign_to(e1); - }, /*else*/ [&](auto /*self*/) - { - using tag = xexpression_tag_t; - xexpression_assigner::assign_xexpression(e1, e2); - }); + xtl::mpl::static_if::value>( + [&](auto self) + { + self(e2).derived_cast().assign_to(e1); + }, + /*else*/ + [&](auto /*self*/) + { + using tag = xexpression_tag_t; + xexpression_assigner::assign_xexpression(e1, e2); + } + ); } template @@ -238,12 +242,16 @@ namespace xt // A row_major or column_major container with a dimension <= 1 is computed as // layout any, leading to some performance improvements, for example when // assigning a col-major vector to a row-major vector etc - return compute_layout(select_layout::value, - select_layout::value) != layout_type::dynamic; + return compute_layout( + select_layout::value, + select_layout::value + ) + != layout_type::dynamic; } template - inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> + inline auto is_linear_assign(const E1& e1, const E2& e2) + -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); @@ -258,9 +266,8 @@ namespace xt template inline bool linear_dynamic_layout(const E1& e1, const E2& e2) { - return e1.is_contiguous() - && e2.is_contiguous() - && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; + return e1.is_contiguous() && e2.is_contiguous() + && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; } template @@ -269,16 +276,20 @@ namespace xt }; template - struct has_step_leading().step_leading())>> - : std::true_type + struct has_step_leading().step_leading())>> : std::true_type { }; template struct use_strided_loop { - static constexpr bool stepper_deref() { return std::is_reference::value; } - static constexpr bool value = has_strides::value && has_step_leading::value && stepper_deref(); + static constexpr bool stepper_deref() + { + return std::is_reference::value; + } + + static constexpr bool value = has_strides::value + && has_step_leading::value && stepper_deref(); }; template @@ -332,43 +343,89 @@ namespace xt template using is_bool = std::is_same; - static constexpr bool is_bool_conversion() { return is_bool::value && !is_bool::value; } - static constexpr bool contiguous_layout() { return E1::contiguous_layout && E2::contiguous_layout; } - static constexpr bool convertible_types() { return std::is_convertible::value - && !is_bool_conversion(); } + static constexpr bool is_bool_conversion() + { + return is_bool::value && !is_bool::value; + } + + static constexpr bool contiguous_layout() + { + return E1::contiguous_layout && E2::contiguous_layout; + } + + static constexpr bool convertible_types() + { + return std::is_convertible::value && 
!is_bool_conversion(); + } - static constexpr bool use_xsimd() { return xt_simd::simd_traits::size > 1; } + static constexpr bool use_xsimd() + { + return xt_simd::simd_traits::size > 1; + } template - static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } - static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } - static constexpr bool simd_interface() { return has_simd_interface() && - has_simd_interface(); } + static constexpr bool simd_size_impl() + { + return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); + } + + static constexpr bool simd_size() + { + return simd_size_impl() && simd_size_impl(); + } + + static constexpr bool simd_interface() + { + return has_simd_interface() + && has_simd_interface(); + } public: // constexpr methods instead of constexpr data members avoid the need of definitions at namespace // scope of these data members (since they are odr-used). - static constexpr bool simd_assign() { return convertible_types() && simd_size() && simd_interface(); } - static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) { return trivial && detail::is_linear_assign(e1, e2); } - static constexpr bool strided_assign() { return detail::use_strided_loop::value && detail::use_strided_loop::value; } - static constexpr bool simd_linear_assign() { return contiguous_layout() && simd_assign(); } - static constexpr bool simd_strided_assign() { return strided_assign() && simd_assign(); } + static constexpr bool simd_assign() + { + return convertible_types() && simd_size() && simd_interface(); + } + + static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) + { + return trivial && detail::is_linear_assign(e1, e2); + } + + static constexpr bool strided_assign() + { + return detail::use_strided_loop::value && detail::use_strided_loop::value; + } + + static constexpr bool simd_linear_assign() + { + return contiguous_layout() && simd_assign(); + } - static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) { return simd_assign() - && detail::linear_dynamic_layout(e1, e2); } + static constexpr bool simd_strided_assign() + { + return strided_assign() && simd_assign(); + } - using e2_requested_value_type = std::conditional_t::value, - typename E2::bool_load_type, - e2_value_type>; - using requested_value_type = detail::conditional_promote_to_complex_t; + static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) + { + return simd_assign() && detail::linear_dynamic_layout(e1, e2); + } + using e2_requested_value_type = std:: + conditional_t::value, typename E2::bool_load_type, e2_value_type>; + using requested_value_type = detail::conditional_promote_to_complex_t; }; template - inline void xexpression_assigner_base::assign_data(xexpression& e1, const xexpression& e2, bool trivial) + inline void xexpression_assigner_base::assign_data( + xexpression& e1, + const xexpression& e2, + bool trivial + ) { E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -378,9 +435,10 @@ namespace xt constexpr bool simd_assign = traits::simd_assign(); constexpr bool simd_linear_assign = traits::simd_linear_assign(); constexpr bool simd_strided_assign = traits::simd_strided_assign(); + constexpr bool strided_assign = traits::strided_assign(); if (linear_assign) { - if(simd_linear_assign || traits::simd_linear_assign(de1, de2)) + if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) { // Do not use linear_assigner here since it 
will make the compiler // instantiate this branch even if the runtime condition is false, resulting @@ -394,7 +452,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign && de1.layout() == de2.layout()) + else if (simd_strided_assign) { strided_loop_assigner::run(de1, de2); } @@ -418,16 +476,27 @@ namespace xt inline void xexpression_assigner::computed_assign(xexpression& e1, const xexpression& e2) { using shape_type = typename E1::shape_type; + using comperator_type = std::greater; + using size_type = typename E1::size_type; E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); - size_type dim = de2.dimension(); - shape_type shape = uninitialized_shape(dim); + size_type dim2 = de2.dimension(); + shape_type shape = uninitialized_shape(dim2); + bool trivial_broadcast = de2.broadcast_shape(shape, true); - if (dim > de1.dimension() || shape > de1.shape()) + auto&& de1_shape = de1.shape(); + if (dim2 > de1.dimension() + || std::lexicographical_compare( + shape.begin(), + shape.end(), + de1_shape.begin(), + de1_shape.end(), + comperator_type() + )) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -455,7 +524,8 @@ namespace xt template template - inline void xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) + inline void + xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) { const E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -499,16 +569,20 @@ namespace xt inline bool xexpression_assigner::resize(E1& e1, const xfunction& e2) { return xtl::mpl::static_if::shape_type>::value>( - [&](auto /*self*/) { + [&](auto /*self*/) + { /* * If the shape of the xfunction is statically known, we can compute the broadcast triviality * at compile time plus we can resize right away. */ // resize in case LHS is not a fixed size container. 
If it is, this is a NOP e1.resize(typename xfunction::shape_type{}); - return detail::static_trivial_broadcast::shape_type>::value, CT...>::value; + return detail::static_trivial_broadcast< + detail::is_fixed::shape_type>::value, + CT...>::value; }, - /* else */ [&](auto /*self*/) + /* else */ + [&](auto /*self*/) { using index_type = xindex_type_t; using size_type = typename E1::size_type; @@ -531,9 +605,10 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = xtl::is_arithmetic::value && - (sizeof(result_type) < sizeof(argument_type) || - (xtl::is_integral::value && std::is_floating_point::value)); + static const bool value = xtl::is_arithmetic::value + && (sizeof(result_type) < sizeof(argument_type) + || (xtl::is_integral::value + && std::is_floating_point::value)); }; template @@ -551,15 +626,16 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = is_narrowing_conversion::value || - has_sign_conversion::value; + static const bool value = is_narrowing_conversion::value + || has_sign_conversion::value; }; template inline stepper_assigner::stepper_assigner(E1& e1, const E2& e2) - : m_e1(e1), m_lhs(e1.stepper_begin(e1.shape())), - m_rhs(e2.stepper_begin(e1.shape())), - m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) + : m_e1(e1) + , m_lhs(e1.stepper_begin(e1.shape())) + , m_rhs(e2.stepper_begin(e1.shape())) + , m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) { } @@ -637,10 +713,29 @@ namespace xt } #if defined(XTENSOR_USE_TBB) - tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) + if (size >= XTENSOR_TBB_THRESHOLD) { - e1.template store_simd(i, e2.template load_simd(i)); - }); + static tbb::affinity_partitioner ap; + tbb::parallel_for( + align_begin, + align_end, + simd_size, + [&e1, &e2](size_t i) + { + e1.template store_simd( + i, + e2.template load_simd(i) + ); + }, ap + ); + } + else + { + for (size_type i = align_begin; i < align_end; i += simd_size) + { + e1.template store_simd(i, e2.template load_simd(i)); + } + } #elif defined(XTENSOR_USE_OPENMP) if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { @@ -651,8 +746,8 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); } #else - for(auto i = static_cast(align_begin); i < static_cast(align_end); - i += static_cast(simd_size)) + for (auto i = static_cast(align_begin); i < static_cast(align_end); + i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); @@ -681,8 +776,8 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { - using is_convertible = std::is_convertible::value_type, - typename std::decay_t::value_type>; + using is_convertible = std:: + is_convertible::value_type, typename std::decay_t::value_type>; // If the types are not compatible, this function is still instantiated but never called. // To avoid compilation problems in effectively unused code trivial_assigner_run_impl is // empty in this case. 
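// The TBB branch of the SIMD linear assigner above follows a simple chunking scheme:
// step the flat range in simd_size-wide increments, dispatch the steps with
// tbb::parallel_for only above a size threshold, and keep a serial loop otherwise.
// A minimal standalone sketch of that pattern (kTbbThreshold and the scalar inner
// loop are illustrative stand-ins for XTENSOR_TBB_THRESHOLD and the
// store_simd/load_simd pair; simd_size is assumed to be non-zero):
#include <cstddef>
#include <vector>
#include <tbb/parallel_for.h>

template <class T>
void chunked_parallel_copy(std::vector<T>& dst, const std::vector<T>& src, std::size_t simd_size)
{
    const std::size_t n = dst.size();
    const std::size_t tail_begin = n - n % simd_size;  // unaligned remainder is handled serially
    const std::size_t kTbbThreshold = std::size_t(1) << 16;
    if (n >= kTbbThreshold)
    {
        tbb::parallel_for(
            std::size_t(0),
            tail_begin,
            simd_size,
            [&](std::size_t i)
            {
                // one simd_size-wide block per task index, standing in for store_simd/load_simd
                for (std::size_t j = i; j < i + simd_size; ++j)
                {
                    dst[j] = src[j];
                }
            }
        );
    }
    else
    {
        for (std::size_t i = 0; i < tail_begin; ++i)
        {
            dst[i] = src[i];
        }
    }
    for (std::size_t i = tail_begin; i < n; ++i)  // scalar epilogue, like the loop after align_end
    {
        dst[i] = src[i];
    }
}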
@@ -699,10 +794,14 @@ namespace xt size_type n = e1.size(); // std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) - { - *(dst + i) = static_cast(*(src + i)); - }); + tbb::parallel_for( + std::ptrdiff_t(0), + static_cast(n), + [&](std::ptrdiff_t i) + { + *(dst + i) = static_cast(*(src + i)); + } + ); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { @@ -733,9 +832,7 @@ namespace xt template inline void linear_assigner::run_impl(E1&, const E2&, std::false_type /*is_convertible*/) { - XTENSOR_PRECONDITION(false, - "Internal error: linear_assigner called with unrelated types."); - + XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types."); } /**************************************** @@ -851,14 +948,13 @@ namespace xt using strides_type = S; check_strides_functor(const S& strides) - : m_cut(L == layout_type::row_major ? 0 : strides.size()), - m_strides(strides) + : m_cut(L == layout_type::row_major ? 0 : strides.size()) + , m_strides(strides) { } template - std::enable_if_t - operator()(const T& el) + std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) @@ -869,8 +965,7 @@ namespace xt } template - std::enable_if_t - operator()(const T& el) + std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var < m_cut) @@ -913,18 +1008,25 @@ namespace xt } else if (E1::static_layout == layout_type::column_major || !is_row_major) { - auto csf = check_strides_functor(e1.strides()); + auto csf = check_strides_functor(e1.strides() + ); cut = csf(e2); if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; - std::size_t outer_loop_size = static_cast( - std::accumulate(e1.shape().begin(), e1.shape().begin() + static_cast(cut), - shape_value_type(1), std::multiplies{})); - std::size_t inner_loop_size = static_cast( - std::accumulate(e1.shape().begin() + static_cast(cut), e1.shape().end(), - shape_value_type(1), std::multiplies{})); + std::size_t outer_loop_size = static_cast(std::accumulate( + e1.shape().begin(), + e1.shape().begin() + static_cast(cut), + shape_value_type(1), + std::multiplies{} + )); + std::size_t inner_loop_size = static_cast(std::accumulate( + e1.shape().begin() + static_cast(cut), + e1.shape().end(), + shape_value_type(1), + std::multiplies{} + )); if (E1::static_layout == layout_type::column_major || !is_row_major) { @@ -942,37 +1044,30 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) - { - layout_type dynamic_layout = e1.layout(); - switch (dynamic_layout) - { - case layout_type::row_major: - is_row_major = true; - break; - case layout_type::column_major: - is_row_major = false; - break; - default: - return fallback_assigner(e1, e2).run(); - } - } - else if (E1::static_layout == layout_type::row_major) - { + bool mismatch_colrow = (e1.layout() == layout_type::row_major && e2.layout() == layout_type::column_major) + || (e2.layout() == layout_type::row_major && e1.layout() == layout_type::column_major); + if (mismatch_colrow) + return fallback_assigner(e1, e2).run(); + + bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; + bool de1_col_contiguous = e1.strides()[0] == 1; + bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; + bool de2_col_contiguous = e2.strides()[0] 
== 1; + + if (de1_row_contiguous && de2_row_contiguous) is_row_major = true; - } - else if (E1::static_layout == layout_type::column_major) - { + else if (de1_col_contiguous && de2_col_contiguous) is_row_major = false; - } else - { - XTENSOR_THROW(std::runtime_error, "Illegal layout set (layout_type::any?)."); - } + return fallback_assigner(e1, e2).run(); std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes(e1, e2, is_row_major); + std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( + e1, + e2, + is_row_major + ); if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) { @@ -999,9 +1094,10 @@ namespace xt using e2_value_type = typename E2::value_type; constexpr bool needs_cast = has_assign_conversion::value; using value_type = typename xassign_traits::requested_value_type; - using simd_type = std::conditional_t::value, - xt_simd::simd_bool_type, - xt_simd::simd_type>; + using simd_type = std::conditional_t< + std::is_same::value, + xt_simd::simd_bool_type, + xt_simd::simd_type>; std::size_t simd_size = inner_loop_size / simd_type::size; std::size_t simd_rest = inner_loop_size % simd_type::size; @@ -1012,7 +1108,7 @@ namespace xt // TODO in 1D case this is ambigous -- could be RM or CM. // Use default layout to make decision std::size_t step_dim = 0; - if (!is_row_major) // row major case + if (!is_row_major) // row major case { step_dim = cut; } @@ -1063,7 +1159,54 @@ namespace xt } } else { - +#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) + if (outer_loop_size > 20) { + tbb::parallel_for(0ul, outer_loop_size, + [&, fct_stepper, res_stepper, idx](tbb::blocked_range const &r) { + std::size_t first_step = true; +//#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox, idx, + max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox, idx, + max_shape); + + for (std::size_t i = 0; i < idx.size(); ++i) { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + + for (std::size_t i = 0; i < simd_size; ++i) { + res_stepper.template store_simd( + fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + }); + } + else { + #endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { @@ -1078,9 +1221,9 @@ namespace xt fct_stepper.step_leading(); } - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); + is_row_major + ? 
strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); fct_stepper.to_begin(); From 68ba3def8a9457fe70e923734f9f5a276b86caee Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 11:42:14 +0100 Subject: [PATCH 11/33] Compiles and maybe works now.. --- include/xtensor/xassign.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5b7e513ac..b82c04268 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1052,12 +1052,12 @@ namespace xt bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; bool de1_col_contiguous = e1.strides()[0] == 1; - bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; - bool de2_col_contiguous = e2.strides()[0] == 1; + //bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; + //bool de2_col_contiguous = e2.strides()[0] == 1; - if (de1_row_contiguous && de2_row_contiguous) + if (de1_row_contiguous)// && de2_row_contiguous) is_row_major = true; - else if (de1_col_contiguous && de2_col_contiguous) + else if (de1_col_contiguous)// && de2_col_contiguous) is_row_major = false; else return fallback_assigner(e1, e2).run(); @@ -1161,8 +1161,11 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) if (outer_loop_size > 20) { - tbb::parallel_for(0ul, outer_loop_size, - [&, fct_stepper, res_stepper, idx](tbb::blocked_range const &r) { + tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), + [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) { + auto idx = idx_; + auto res_stepper = res_stepper_; + auto fct_stepper = fct_stepper_; std::size_t first_step = true; //#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { @@ -1245,7 +1248,7 @@ namespace xt } } } -#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) +#if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign) } #endif } From 0a62ded224c3f3f821192c5157bb6be390d299e0 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 14:42:41 +0100 Subject: [PATCH 12/33] Use static partitioner instead --- include/xtensor/xassign.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index b82c04268..f672d244e 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -715,7 +715,7 @@ namespace xt #if defined(XTENSOR_USE_TBB) if (size >= XTENSOR_TBB_THRESHOLD) { - static tbb::affinity_partitioner ap; + tbb::static_partitioner ap; tbb::parallel_for( align_begin, align_end, @@ -794,14 +794,14 @@ namespace xt size_type n = e1.size(); // std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { *(dst + i) = static_cast(*(src + i)); - } - ); + }, sp); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { @@ -1161,6 +1161,7 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) if (outer_loop_size > 20) { + tbb::static_partitioner sp; tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) 
{ auto idx = idx_; @@ -1206,7 +1207,7 @@ namespace xt res_stepper.step(i + step_dim, idx[i]); } } - }); + }, sp); } else { From 3c4e93a148cdacd01af5b9919a027f10300ad230 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 14:52:37 +0100 Subject: [PATCH 13/33] Remove unused variable --- include/xtensor/xassign.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index f672d244e..db3d9435a 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -435,7 +435,6 @@ namespace xt constexpr bool simd_assign = traits::simd_assign(); constexpr bool simd_linear_assign = traits::simd_linear_assign(); constexpr bool simd_strided_assign = traits::simd_strided_assign(); - constexpr bool strided_assign = traits::strided_assign(); if (linear_assign) { if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) From d1209f2863f095c7f18af6351ece858891cf07b6 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 15:21:29 +0100 Subject: [PATCH 14/33] Make explicitly the steppers in teh lambda instead of a copy --- include/xtensor/xassign.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index db3d9435a..5be9f947c 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1162,10 +1162,10 @@ namespace xt if (outer_loop_size > 20) { tbb::static_partitioner sp; tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), - [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) { + [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_=idx](tbb::blocked_range const &r) { auto idx = idx_; - auto res_stepper = res_stepper_; - auto fct_stepper = fct_stepper_; + auto fct_stepper = e2.stepper_begin(e1.shape()); + auto res_stepper = e1.stepper_begin(e1.shape()); std::size_t first_step = true; //#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { From 01c8b4c00d9d57c94c8a37373ab187b20e123e99 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 2 Feb 2023 20:12:17 +0100 Subject: [PATCH 15/33] Fix the strided assigner --- include/xtensor/xassign.hpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5be9f947c..e6a43580a 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -955,6 +955,7 @@ namespace xt template std::enable_if_t operator()(const T& el) { + // All dimenions less than var have differing strides auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) { @@ -967,6 +968,7 @@ namespace xt std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); + // All dimensions >= var have differing strides if (var < m_cut) { m_cut = var; @@ -996,16 +998,18 @@ namespace xt template auto get_loop_sizes(const E1& e1, const E2& e2, bool is_row_major) { + // Cut is the number of dimensions in the outer loop std::size_t cut = 0; // TODO! if E1 is !contiguous --> initialize cut to sensible value! - if (E1::static_layout == layout_type::row_major || is_row_major) + if (is_row_major) { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + // This makes that only one dimension will be treated in the inner loop. 
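// Concrete example of the cut in the row-major case: for an assignee of shape
// {4, 3, 5} whose strides differ from the source over the leading dimensions
// (e.g. because of broadcasting), forcing the cut to 2 puts the first two
// dimensions in the outer loop (outer_loop_size = 4 * 3 = 12) and leaves the
// last, contiguous dimension of length 5 to the linear/SIMD inner loop; in the
// column-major case the two products swap roles.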
if (cut(e1.strides() ); @@ -1027,7 +1031,7 @@ namespace xt std::multiplies{} )); - if (E1::static_layout == layout_type::column_major || !is_row_major) + if (!is_row_major) { std::swap(outer_loop_size, inner_loop_size); } @@ -1044,23 +1048,21 @@ namespace xt bool is_row_major = true; using fallback_assigner = stepper_assigner; - bool mismatch_colrow = (e1.layout() == layout_type::row_major && e2.layout() == layout_type::column_major) - || (e2.layout() == layout_type::row_major && e1.layout() == layout_type::column_major); - if (mismatch_colrow) - return fallback_assigner(e1, e2).run(); - + // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` dimensions, + // and the inner loop is + is_row_major = true; bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; bool de1_col_contiguous = e1.strides()[0] == 1; - //bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; - //bool de2_col_contiguous = e2.strides()[0] == 1; - - if (de1_row_contiguous)// && de2_row_contiguous) + if (de1_row_contiguous) is_row_major = true; - else if (de1_col_contiguous)// && de2_col_contiguous) + else if (de1_col_contiguous) // If first stride is 1, and we don't have a one-dimensional container is_row_major = false; else - return fallback_assigner(e1, e2).run(); + // We might be dealing with a broadcasted dimension at the end. + // Since this is not detected explicitly, just try row-major by default + is_row_major = true; + // cut is the number of dimensions that the outer loop takes care of. std::size_t inner_loop_size, outer_loop_size, cut; std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( e1, From e4558a630a7fba31b7652bab470f26849595c7f0 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 23 Feb 2023 17:51:18 +0100 Subject: [PATCH 16/33] Add a test, and fix a few bugs for the strided (parallel) assigner. --- include/xtensor/xassign.hpp | 518 +++++++++++++++++++-------------- include/xtensor/xcontainer.hpp | 21 +- test/CMakeLists.txt | 1 + test/test_strided_assign.cpp | 114 ++++++++ 4 files changed, 426 insertions(+), 228 deletions(-) create mode 100644 test/test_strided_assign.cpp diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index e6a43580a..4a68827b6 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -172,13 +172,29 @@ namespace xt * strided_loop_assigner * *************************/ + namespace strided_assign_detail { + struct loop_sizes_t { + bool is_row_major; + std::size_t inner_loop_size; + std::size_t outer_loop_size; + std::size_t cut; + std::size_t dimension; + inline bool can_do_strided_assign() { // Is it actually useful to do a strided assign? 
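// A strided assign only pays off when the contiguous inner run spans more than one
// element; otherwise the caller falls back to the generic stepper_assigner.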
+ return inner_loop_size > 1; + //return !((is_row_major && cut == dimension) || (!is_row_major && cut == 0)) && inner_loop_size != 1; + }; + }; + } template class strided_loop_assigner { public: - + using loop_sizes_t = strided_assign_detail::loop_sizes_t; + // is_row_major, inner_loop_size, outer_loop_size, cut template - static void run(E1& e1, const E2& e2); + static void run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes); + template + static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2); }; /*********************************** @@ -253,8 +269,8 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { - return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); + return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) + || (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -451,14 +467,20 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) - { - strided_loop_assigner::run(de1, de2); - } else { - stepper_assigner assigner(de1, de2); - assigner.run(); + if (simd_strided_assign) + { + strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(de1, de2); + if (loop_sizes.can_do_strided_assign()) + strided_loop_assigner::run(de1, de2, loop_sizes); + else + stepper_assigner(de1, de2).run(); + } + else + { + stepper_assigner(de1, de2).run(); + } } } @@ -690,7 +712,6 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { -// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -725,7 +746,8 @@ namespace xt i, e2.template load_simd(i) ); - }, ap + }, + ap ); } else @@ -791,7 +813,6 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); -// std::cout << "Linear assigner LOOP SIZE is " << n <(*(src + i)); - }, sp); + }, + sp + ); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n); i++) { *(dst + i) = static_cast(*(src + i)); } } - else { + else + { for (; n > size_type(0); --n) { *dst = static_cast(*src); ++src; ++dst; - } + } } #else for (; n > size_type(0); --n) @@ -863,34 +887,35 @@ namespace xt } } } - - template + + template static void nth_idx(size_t n, T& outer_index, const T& outer_shape) - { - - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - // compute strides - using size_type = typename T::size_type; - for ( size_type i = outer_shape.size(); i>0; i--) { - stride_sizes[i-1] = (i==outer_shape.size()) ? 1 : stride_sizes[i] * outer_shape[i]; - } - - // compute index - for (size_type i=0; i < outer_shape.size(); i++) + { + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + // compute strides + using size_type = typename T::size_type; + for (size_type i = outer_shape.size(); i > 0; i--) + { + stride_sizes[i - 1] = (i == outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } + + // compute index + for (size_type i = 0; i < outer_shape.size(); i++) { - auto d_idx = n/stride_sizes[i]; - if (d_idx < outer_shape[i]) { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - outer_index[i] = 0; - } + auto d_idx = n / stride_sizes[i]; + if (d_idx < outer_shape[i]) + { + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; + } + else + { + // that may not happen + outer_index[i] = 0; + } } - } - + } }; template <> @@ -915,30 +940,34 @@ namespace xt } } } - - template + + template static void nth_idx(size_t n, T& outer_index, const T& outer_shape) - { - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - - using size_type = typename T::size_type; - - // compute required strides - for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } - - // compute index - for (size_type i = outer_shape.size(); i>0;) - { - i--; - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-1)) { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } -// else {} // that may not happen + { + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + + using size_type = typename T::size_type; + + // compute required strides + for (size_type i = 0; i < outer_shape.size(); i++) + { + stride_sizes[i] = (i == 0) ? 1 : stride_sizes[i - 1] * outer_shape[i - 1]; + } + + // compute index + for (size_type i = outer_shape.size(); i > 0;) + { + i--; + auto d_idx = n / stride_sizes[i]; + if (d_idx < (outer_shape[i] - 1)) + { + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; + } + // else {} // that may not happen } - } + } }; template @@ -996,28 +1025,59 @@ namespace xt }; template - auto get_loop_sizes(const E1& e1, const E2& e2, bool is_row_major) + loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { + using shape_value_type = typename E1::shape_type::value_type; + bool is_row_major = true; + + // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` + // dimensions, and the inner loop is + is_row_major = true; + auto is_zero = [](auto i){return i == 0;}; + auto &&strides = e1.strides(); + auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero); + bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1; + auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero); + bool de1_col_contiguous = it_fwd != strides.end() && *it_fwd == 1; + if (de1_row_contiguous) + { + is_row_major = true; + } + else if (de1_col_contiguous) + { + is_row_major = false; + } + else + { + // No strided loop possible. + return {true, 1, e1.size(), e1.dimension(), e1.dimension()}; + } + // Cut is the number of dimensions in the outer loop std::size_t cut = 0; - // TODO! if E1 is !contiguous --> initialize cut to sensible value! if (is_row_major) { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); // This makes that only one dimension will be treated in the inner loop. 
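// Standalone sketch of what the idx_tools::nth_idx helpers above compute in the
// row-major case: the usual unravel-index mapping from a flat position n to a
// per-dimension index (assumes a non-empty shape with non-zero extents; this is
// an illustration, not part of the xtensor API):
#include <cstddef>
#include <vector>

inline std::vector<std::size_t> unravel_row_major(std::size_t n, const std::vector<std::size_t>& shape)
{
    std::vector<std::size_t> index(shape.size(), 0);
    std::vector<std::size_t> strides(shape.size(), 1);
    for (std::size_t i = shape.size() - 1; i > 0; --i)
    {
        strides[i - 1] = strides[i] * shape[i];  // row-major: last dimension is contiguous
    }
    for (std::size_t i = 0; i < shape.size(); ++i)
    {
        index[i] = n / strides[i];
        n -= index[i] * strides[i];
    }
    return index;
}

// e.g. unravel_row_major(7, {4, 3, 2}) yields {1, 0, 1}, since 7 == 1*6 + 0*2 + 1*1.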
- if (cut(e1.strides() - ); + auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - if (cut>1) cut = 1; - } // can't reach here because this would have already triggered the fallback + if (cut > 1) + { + // Only make the inner loop go over one dimension by default for now + cut = 1; + } + } // can't reach here because this would have already triggered the fallback - using shape_value_type = typename E1::shape_type::value_type; std::size_t outer_loop_size = static_cast(std::accumulate( e1.shape().begin(), e1.shape().begin() + static_cast(cut), @@ -1036,44 +1096,28 @@ namespace xt std::swap(outer_loop_size, inner_loop_size); } - return std::make_tuple(inner_loop_size, outer_loop_size, cut); + return {is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; } } -#define strided_parallel_assign template template - inline void strided_loop_assigner::run(E1& e1, const E2& e2) + inline strided_assign_detail::loop_sizes_t strided_loop_assigner::get_loop_sizes(E1& e1, const E2& e2) { - bool is_row_major = true; - using fallback_assigner = stepper_assigner; - - // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` dimensions, - // and the inner loop is - is_row_major = true; - bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; - bool de1_col_contiguous = e1.strides()[0] == 1; - if (de1_row_contiguous) - is_row_major = true; - else if (de1_col_contiguous) // If first stride is 1, and we don't have a one-dimensional container - is_row_major = false; - else - // We might be dealing with a broadcasted dimension at the end. - // Since this is not detected explicitly, just try row-major by default - is_row_major = true; + return strided_assign_detail::get_loop_sizes(e1, e2); + } - // cut is the number of dimensions that the outer loop takes care of. - std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( - e1, - e2, - is_row_major - ); +#define strided_parallel_assign + + template + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes) + { + bool is_row_major = loop_sizes.is_row_major; + std::size_t inner_loop_size = loop_sizes.inner_loop_size; + std::size_t outer_loop_size = loop_sizes.outer_loop_size; + std::size_t cut = loop_sizes.cut; - if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) - { - return fallback_assigner(e1, e2).run(); - } // TODO can we get rid of this and use `shape_type`? dynamic_shape idx, max_shape; @@ -1113,128 +1157,45 @@ namespace xt { step_dim = cut; } -// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <20) { - std::size_t first_step = true; -#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? 
- strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.template store_simd(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - } - } - else { -#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) - if (outer_loop_size > 20) { - tbb::static_partitioner sp; - tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), - [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_=idx](tbb::blocked_range const &r) { - auto idx = idx_; - auto fct_stepper = e2.stepper_begin(e1.shape()); - auto res_stepper = e1.stepper_begin(e1.shape()); - std::size_t first_step = true; -//#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_idx(ox, idx, - max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox, idx, - max_shape); - - for (std::size_t i = 0; i < idx.size(); ++i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd( - fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - } - }, sp); - } - else { - -#endif - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + if (outer_loop_size > 20) { - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.store_simd(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) + std::size_t first_step = true; +#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } + if (first_step) + { + is_row_major + ? 
strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape) + : strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape); + + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } - is_row_major - ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) - : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } - fct_stepper.to_begin(); + // next unaligned index + is_row_major + ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) - if (!E1::contiguous_layout) - { + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); res_stepper.to_begin(); for (std::size_t i = 0; i < idx.size(); ++i) { @@ -1242,22 +1203,129 @@ namespace xt res_stepper.step(i + step_dim, idx[i]); } } - else + } + else + { +#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) + if (outer_loop_size > 20) + { + tbb::static_partitioner sp; + tbb::parallel_for( + tbb::blocked_range(0ul, outer_loop_size), + [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_ = idx]( + const tbb::blocked_range& r + ) + { + auto idx = idx_; + auto fct_stepper = e2.stepper_begin(e1.shape()); + auto res_stepper = e1.stepper_begin(e1.shape()); + std::size_t first_step = true; + // #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, + // res_stepper, idx) + for (std::size_t ox = r.begin(); ox < r.end(); ++ox) + { + if (first_step) + { + is_row_major + ? strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape) + : strided_assign_detail::idx_tools::nth_idx( + ox, + idx, + max_shape + ); + + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd( + fct_stepper.template step_simd() + ); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major + ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + }, + sp + ); + } + else + { + +#endif + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { - for (std::size_t i = 0; i < idx.size(); ++i) + for (std::size_t i = 0; i < simd_size; ++i) { - fct_stepper.step(i + step_dim, idx[i]); + res_stepper.store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + is_row_major + ? 
strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + fct_stepper.to_begin(); + + // need to step E1 as well if not contigous assign (e.g. view) + if (!E1::contiguous_layout) + { + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } - } #if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign) - } + } #endif - } + } template <> template - inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/) + inline void strided_loop_assigner::run( + E1& /*e1*/, + const E2& /*e2*/, + loop_sizes_t const & + ) { } } diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index f5a79d3d7..792af6f82 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -924,9 +924,24 @@ namespace xt inline bool xstrided_container::is_contiguous() const noexcept { using str_type = typename inner_strides_type::value_type; - return is_contiguous_container::value - && (m_strides.empty() || (m_layout == layout_type::row_major && m_strides.back() == str_type(1)) - || (m_layout == layout_type::column_major && m_strides.front() == str_type(1))); + auto is_zero = [](auto i){return i == 0;}; + // This is unsafe if the strides are all 0. Does that happen? + if (!is_contiguous_container::value) + return false; + if (m_layout == layout_type::row_major) + { + auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero); + // If the array has strides of zero, it is a constant, and therefore contiguous. + return it == m_strides.rend() || *it == str_type(1); + } + else if (m_layout == layout_type::column_major) + { + auto it = std::find_if_not(m_strides.begin(), m_strides.end(), is_zero); + // If the array has strides of zero, it is a constant, and therefore contiguous. + return it == m_strides.end() || *it == str_type(1); + } else { + m_strides.empty(); + } } namespace detail diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7f696e6a5..0ce881944 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -173,6 +173,7 @@ set(XTENSOR_TESTS main.cpp test_xaccumulator.cpp test_xadapt.cpp + test_strided_assign.cpp test_xassign.cpp test_xaxis_iterator.cpp test_xaxis_slice_iterator.cpp diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp new file mode 100644 index 000000000..44ab98c07 --- /dev/null +++ b/test/test_strided_assign.cpp @@ -0,0 +1,114 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#include +#include + +#include "xtensor/xadapt.hpp" +#include "xtensor/xarray.hpp" +#include "xtensor/xassign.hpp" +#include "xtensor/xnoalias.hpp" +#include "xtensor/xtensor.hpp" +#include "xtensor/xview.hpp" + +#include "test_common.hpp" +#include "test_common_macros.hpp" + +namespace xt +{ + TEST(xassign_strided, mix_shape_types) + { + auto check_linear_assign = [](auto a, auto b) { + assert_compatible_shape(a, b); + return xassign_traits::linear_assign(a, b, true); + }; + auto check_strided_assign = [](auto a, auto b) { + assert_compatible_shape(a, b); + static_assert(xassign_traits::strided_assign(), "Failed to do strided assign"); + return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign(); + }; + { + auto data = std::vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto simple_xtensor_12 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12}}; + auto simple_xtensor_16 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}}; + auto adapter_strided_noncont = xt::adapt( + data.data(), + 12, + xt::no_ownership(), + std::vector{4, 3}, + std::vector{4, 1} + ); + auto adapter_strided_cont = xt::adapt( + data.data() + 1, + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{4, 1} + ); + auto adapter_strided_noncont_43 = xt::adapt(data.data(), 12, xt::no_ownership(), std::vector{3,4}, + std::vector{5,1}); + EXPECT_FALSE(check_strided_assign(adapter_strided_noncont, xt::transpose(adapter_strided_cont))); + auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0,3), xt::all()); + EXPECT_TRUE(check_linear_assign(contiguous_view, adapter_strided_cont)); + EXPECT_FALSE(check_linear_assign(contiguous_view, xt::transpose(adapter_strided_noncont))); + EXPECT_FALSE(check_linear_assign(xt::transpose(adapter_strided_noncont), contiguous_view)); + EXPECT_FALSE(check_linear_assign(contiguous_view, adapter_strided_noncont_43)); + EXPECT_FALSE(check_linear_assign(adapter_strided_noncont_43, contiguous_view)); + EXPECT_TRUE(check_strided_assign(contiguous_view, adapter_strided_noncont_43)); + EXPECT_TRUE(check_strided_assign(adapter_strided_noncont_43, contiguous_view)); + auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1,3), xt::all()); + auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0,2), xt::all()); + EXPECT_TRUE(check_linear_assign(contiguous_view2, contiguous_view3)); + { + auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0,3)); + EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); + EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); + } + + { + std::vector data2{-1, -1, -1, -1}; + auto linear_adapter = xt::adapt(data2.data(), 4, xt::no_ownership(), std::vector{4, 1}); + auto adapter_cont2 = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{1, 4}); + EXPECT_TRUE(linear_adapter.is_contiguous()); + EXPECT_TRUE(adapter_cont2.is_contiguous()); + bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); + bool success_two = check_linear_assign(adapter_cont2, xt::transpose(linear_adapter)); + EXPECT_TRUE(success_one && success_two); + auto adapter_noncont_singlecol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{4,1},std::vector{4,1}); + EXPECT_FALSE(check_linear_assign(adapter_noncont_singlecol, linear_adapter)); + EXPECT_FALSE(check_linear_assign(linear_adapter, adapter_noncont_singlecol)); + 
EXPECT_FALSE(check_strided_assign(adapter_noncont_singlecol, linear_adapter)); + EXPECT_FALSE(check_strided_assign(linear_adapter, adapter_noncont_singlecol)); + auto adapter_noncont_twocol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2},std::vector{4,1}); + EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); + EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); + EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); + EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); + auto adapter_zero_strides = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2}, std::vector{0,0}); + EXPECT_FALSE(check_linear_assign(adapter_zero_strides, adapter_zero_strides)); + EXPECT_FALSE(check_strided_assign(adapter_zero_strides, adapter_zero_strides)); + EXPECT_FALSE(check_linear_assign(adapter_zero_strides, linear_adapter)); + } + + { + std::vector data2{-1, -1, -1, -1, -1, -1}; + auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); + auto strided_adapter = xt::adapt(data.data(), 6, xt::no_ownership(), std::vector{3, 2}, + std::vector{4, 1}); + EXPECT_FALSE(check_linear_assign(linear_adapter2, strided_adapter)); + EXPECT_FALSE(check_linear_assign(strided_adapter, linear_adapter2)); + EXPECT_TRUE(check_strided_assign(linear_adapter2, strided_adapter)); + EXPECT_TRUE(check_strided_assign(strided_adapter, linear_adapter2)); + xt::noalias(strided_adapter) = linear_adapter2; + auto result_expected = std::vector{-1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + EXPECT_EQ(data, result_expected); + } + } + } +} From 1cc5b59521dd40510429864767b2903eb1800185 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 23 Feb 2023 18:32:28 +0100 Subject: [PATCH 17/33] Revert some unnecessary changes again --- include/xtensor/xstrides.hpp | 2 -- include/xtensor/xview.hpp | 64 ++++++++++++++---------------------- test/test_strided_assign.cpp | 9 +++++ 3 files changed, 33 insertions(+), 42 deletions(-) diff --git a/include/xtensor/xstrides.hpp b/include/xtensor/xstrides.hpp index 99ffa811c..ae261e354 100644 --- a/include/xtensor/xstrides.hpp +++ b/include/xtensor/xstrides.hpp @@ -544,10 +544,8 @@ namespace xt { for (std::size_t i = strides.size(); i != 0; --i) { -// std::cout << "matching: " << strides[i - 1] << " && " << shape[i - 1]<< " && " << data_size << std::endl; if (!stride_match_condition(strides[i - 1], shape[i - 1], data_size, zero_strides)) { -// std::cout << "non-matching: " << strides[i - 1] << " != " << data_size << std::endl; return false; } data_size *= static_cast(shape[i - 1]); diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 52b7c33c0..0a2dbb6be 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -184,23 +184,15 @@ namespace xt ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); - static constexpr bool value = is_contiguous_view_impl>::value; - static constexpr size_t contiguous_dimensions = is_contiguous_view_impl>::contiguous_dimensions + (value ? 
1 :0); + static constexpr bool value = is_contiguous_view_impl < layout_type::row_major, is_valid, + have_all_seen, range_seen || is_range_slice, + xtl::mpl::pop_front_t < V >> ::value; }; template struct is_contiguous_view_impl> { static constexpr bool value = valid; - static constexpr size_t contiguous_dimensions = 0; }; // For column major the *same* but reverse is true -- with the additional @@ -984,41 +976,33 @@ namespace xt return xtl::mpl::static_if( [&](auto self) { - return self(this)->m_e.layout()==layout_type::row_major ? - ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : - ((self(this)->strides()[0] == 1) ? self(this)->m_e.layout() : layout_type::dynamic); - - bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); - return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; + if (static_layout != layout_type::dynamic) + { + return static_layout; + } + else + { + bool strides_match = do_strides_match( + self(this)->shape(), + self(this)->strides(), + self(this)->m_e.layout(), + true + ); + return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; + } }, - /* else */ [&](auto /*self*/) - { - std::cout << "View is not strided!" << std::endl; - return layout_type::dynamic; - }); + /* else */ + [&](auto /*self*/) + { + return layout_type::dynamic; + } + ); } template inline bool xview::is_contiguous() const noexcept { - return xtl::mpl::static_if([&](auto self) - { - if (static_layout != layout_type::dynamic) - { - return true; - } - else - { - bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); - return strides_match ? self(this)->m_e.layout()!=layout_type::dynamic : false; - } - }, - /* else */ [&](auto /*self*/) - { - return false; - }); - -// return layout() != layout_type::dynamic; + return layout() != layout_type::dynamic; } //@} diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 44ab98c07..c89e5cecd 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -70,6 +70,15 @@ namespace xt EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); } + { + auto view_cont = xt::view(simple_xtensor_16, xt::range(0,1), xt::range(0,4)); + auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3,4), xt::range(0,4)); + EXPECT_TRUE(check_linear_assign(view_cont, view_cont2)); + auto view_noncont = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(0,3)); + auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(1,4)); + EXPECT_FALSE(check_linear_assign(view_noncont, view_noncont2)); + EXPECT_TRUE(check_strided_assign(view_noncont, view_noncont2)); + } { std::vector data2{-1, -1, -1, -1}; From 011d8433aa90244353c433f37181ef953b273512 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Sun, 26 Feb 2023 17:34:23 +0100 Subject: [PATCH 18/33] Add tbb to benchmark --- benchmark/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 51228749f..6b98e63f8 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -136,6 +136,15 @@ set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor) add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS}) target_link_libraries(${XTENSOR_BENCHMARK_TARGET} xtensor 
${GBENCHMARK_LIBRARIES}) +if(XTENSOR_USE_TBB) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_TBB) + target_include_directories(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_INCLUDE_DIRS}) + target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_LIBRARIES}) +endif() +if(XTENSOR_USE_OPENMP) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_OPENMP) +endif() + add_custom_target(xbenchmark COMMAND benchmark_xtensor DEPENDS ${XTENSOR_BENCHMARK_TARGET}) From 7c179a5d2931b8b31548eb86784bdc2e65b6ba16 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sat, 25 Feb 2023 23:39:59 +0100 Subject: [PATCH 19/33] Fix typo --- include/xtensor/xcontainer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 792af6f82..41c40be1c 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -940,7 +940,7 @@ namespace xt // If the array has strides of zero, it is a constant, and therefore contiguous. return it == m_strides.end() || *it == str_type(1); } else { - m_strides.empty(); + return m_strides.empty(); } } From 4c49f91a1c6e43d0e1020fa7cceea255ffdfee2e Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:12:40 +0100 Subject: [PATCH 20/33] Fix a few things and refactor to make things backward-compatible --- include/xtensor/xassign.hpp | 80 ++++++++++++++++++--------- test/test_strided_assign.cpp | 103 ++++++++++++++++++++++++----------- test/test_xassign.cpp | 10 ++++ 3 files changed, 135 insertions(+), 58 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 4a68827b6..dc3302495 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -172,29 +172,32 @@ namespace xt * strided_loop_assigner * *************************/ - namespace strided_assign_detail { - struct loop_sizes_t { + namespace strided_assign_detail + { + struct loop_sizes_t + { + bool can_do_strided_assign; bool is_row_major; std::size_t inner_loop_size; std::size_t outer_loop_size; std::size_t cut; std::size_t dimension; - inline bool can_do_strided_assign() { // Is it actually useful to do a strided assign?
- return inner_loop_size > 1; - //return !((is_row_major && cut == dimension) || (!is_row_major && cut == 0)) && inner_loop_size != 1; - }; }; } + template class strided_loop_assigner { public: + using loop_sizes_t = strided_assign_detail::loop_sizes_t; // is_row_major, inner_loop_size, outer_loop_size, cut template - static void run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes); + static void run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes); template static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2); + template + static void run(E1& e1, const E2& e2); }; /*********************************** @@ -471,11 +474,7 @@ namespace xt { if (simd_strided_assign) { - strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(de1, de2); - if (loop_sizes.can_do_strided_assign()) - strided_loop_assigner::run(de1, de2, loop_sizes); - else - stepper_assigner(de1, de2).run(); + strided_loop_assigner::run(de1, de2); } else { @@ -1024,7 +1023,13 @@ namespace xt const strides_type& m_strides; }; - template + template ::value || !has_strides::value || !possible, bool> = true> + loop_sizes_t get_loop_sizes(const E1& e1, const E2&) + { + return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; + } + + template ::value && has_strides::value && possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { using shape_value_type = typename E1::shape_type::value_type; @@ -1033,8 +1038,11 @@ namespace xt // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` // dimensions, and the inner loop is is_row_major = true; - auto is_zero = [](auto i){return i == 0;}; - auto &&strides = e1.strides(); + auto is_zero = [](auto i) + { + return i == 0; + }; + auto&& strides = e1.strides(); auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero); bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1; auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero); @@ -1050,7 +1058,7 @@ namespace xt else { // No strided loop possible. 
- return {true, 1, e1.size(), e1.dimension(), e1.dimension()}; + return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; } // Cut is the number of dimensions in the outer loop @@ -1069,7 +1077,8 @@ namespace xt } else if (!is_row_major) { - auto csf = check_strides_functor(e1.strides()); + auto csf = check_strides_functor(e1.strides() + ); cut = csf(e2); if (cut > 1) { @@ -1096,7 +1105,7 @@ namespace xt std::swap(outer_loop_size, inner_loop_size); } - return {is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; + return {inner_loop_size > 1, is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; } } @@ -1104,14 +1113,14 @@ namespace xt template inline strided_assign_detail::loop_sizes_t strided_loop_assigner::get_loop_sizes(E1& e1, const E2& e2) { - return strided_assign_detail::get_loop_sizes(e1, e2); + return strided_assign_detail::get_loop_sizes(e1, e2); } #define strided_parallel_assign template template - inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes) + inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes) { bool is_row_major = loop_sizes.is_row_major; std::size_t inner_loop_size = loop_sizes.inner_loop_size; @@ -1244,7 +1253,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd( + res_stepper.template store_simd( fct_stepper.template step_simd() ); } @@ -1318,15 +1327,34 @@ namespace xt } #endif } + template <> + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2) + { + strided_assign_detail::loop_sizes_t + loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); + if (loop_sizes.can_do_strided_assign) + { + run(e1, e2, loop_sizes); + } + else + { + // trigger the fallback assigner + stepper_assigner(e1, e2).run(); + } + } template <> template - inline void strided_loop_assigner::run( - E1& /*e1*/, - const E2& /*e2*/, - loop_sizes_t const & - ) + inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&) + { + } + template <> + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2) { + // trigger the fallback assigner + stepper_assigner(e1, e2).run(); } } diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index c89e5cecd..4c91534fb 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -24,19 +24,29 @@ namespace xt { TEST(xassign_strided, mix_shape_types) { - auto check_linear_assign = [](auto a, auto b) { + auto check_linear_assign = [](auto a, auto b) + { assert_compatible_shape(a, b); return xassign_traits::linear_assign(a, b, true); }; - auto check_strided_assign = [](auto a, auto b) { + auto check_strided_assign = [](auto a, auto b) + { assert_compatible_shape(a, b); - static_assert(xassign_traits::strided_assign(), "Failed to do strided assign"); - return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign(); + static_assert( + xassign_traits::strided_assign(), + "Failed to do strided assign" + ); + return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; { - auto data = std::vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - auto simple_xtensor_12 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12}}; - auto simple_xtensor_16 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}}; + auto data = std::vector{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto simple_xtensor_12 = xt::xtensor{{1, 2, 3, 
4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; + auto simple_xtensor_16 = xt::xtensor{ + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}, + {13, 14, 15, 16}}; auto adapter_strided_noncont = xt::adapt( data.data(), 12, @@ -45,16 +55,21 @@ namespace xt std::vector{4, 1} ); auto adapter_strided_cont = xt::adapt( - data.data() + 1, - 12, - xt::no_ownership(), - std::vector{3, 4}, - std::vector{4, 1} + data.data() + 1, + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{4, 1} + ); + auto adapter_strided_noncont_43 = xt::adapt( + data.data(), + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{5, 1} ); - auto adapter_strided_noncont_43 = xt::adapt(data.data(), 12, xt::no_ownership(), std::vector{3,4}, - std::vector{5,1}); EXPECT_FALSE(check_strided_assign(adapter_strided_noncont, xt::transpose(adapter_strided_cont))); - auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0,3), xt::all()); + auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0, 3), xt::all()); EXPECT_TRUE(check_linear_assign(contiguous_view, adapter_strided_cont)); EXPECT_FALSE(check_linear_assign(contiguous_view, xt::transpose(adapter_strided_noncont))); EXPECT_FALSE(check_linear_assign(xt::transpose(adapter_strided_noncont), contiguous_view)); @@ -62,20 +77,20 @@ namespace xt EXPECT_FALSE(check_linear_assign(adapter_strided_noncont_43, contiguous_view)); EXPECT_TRUE(check_strided_assign(contiguous_view, adapter_strided_noncont_43)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont_43, contiguous_view)); - auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1,3), xt::all()); - auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0,2), xt::all()); + auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1, 3), xt::all()); + auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0, 2), xt::all()); EXPECT_TRUE(check_linear_assign(contiguous_view2, contiguous_view3)); { - auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0,3)); + auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0, 3)); EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); } { - auto view_cont = xt::view(simple_xtensor_16, xt::range(0,1), xt::range(0,4)); - auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3,4), xt::range(0,4)); + auto view_cont = xt::view(simple_xtensor_16, xt::range(0, 1), xt::range(0, 4)); + auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3, 4), xt::range(0, 4)); EXPECT_TRUE(check_linear_assign(view_cont, view_cont2)); - auto view_noncont = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(0,3)); - auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(1,4)); + auto view_noncont = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(0, 3)); + auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(1, 4)); EXPECT_FALSE(check_linear_assign(view_noncont, view_noncont2)); EXPECT_TRUE(check_strided_assign(view_noncont, view_noncont2)); } @@ -89,17 +104,35 @@ namespace xt bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); bool success_two = check_linear_assign(adapter_cont2, xt::transpose(linear_adapter)); EXPECT_TRUE(success_one && success_two); - auto adapter_noncont_singlecol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{4,1},std::vector{4,1}); + auto adapter_noncont_singlecol = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + 
std::vector{4, 1}, + std::vector{4, 1} + ); EXPECT_FALSE(check_linear_assign(adapter_noncont_singlecol, linear_adapter)); EXPECT_FALSE(check_linear_assign(linear_adapter, adapter_noncont_singlecol)); EXPECT_FALSE(check_strided_assign(adapter_noncont_singlecol, linear_adapter)); EXPECT_FALSE(check_strided_assign(linear_adapter, adapter_noncont_singlecol)); - auto adapter_noncont_twocol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2},std::vector{4,1}); - EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); - EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); - EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); - EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); - auto adapter_zero_strides = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2}, std::vector{0,0}); + auto adapter_noncont_twocol = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{2, 2}, + std::vector{4, 1} + ); + EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2}))); + EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol)); + EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2}))); + EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol)); + auto adapter_zero_strides = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{2, 2}, + std::vector{0, 0} + ); EXPECT_FALSE(check_linear_assign(adapter_zero_strides, adapter_zero_strides)); EXPECT_FALSE(check_strided_assign(adapter_zero_strides, adapter_zero_strides)); EXPECT_FALSE(check_linear_assign(adapter_zero_strides, linear_adapter)); @@ -108,14 +141,20 @@ namespace xt { std::vector data2{-1, -1, -1, -1, -1, -1}; auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); - auto strided_adapter = xt::adapt(data.data(), 6, xt::no_ownership(), std::vector{3, 2}, - std::vector{4, 1}); + auto strided_adapter = xt::adapt( + data.data(), + 6, + xt::no_ownership(), + std::vector{3, 2}, + std::vector{4, 1} + ); EXPECT_FALSE(check_linear_assign(linear_adapter2, strided_adapter)); EXPECT_FALSE(check_linear_assign(strided_adapter, linear_adapter2)); EXPECT_TRUE(check_strided_assign(linear_adapter2, strided_adapter)); EXPECT_TRUE(check_strided_assign(strided_adapter, linear_adapter2)); xt::noalias(strided_adapter) = linear_adapter2; - auto result_expected = std::vector{-1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto result_expected = std::vector{ + -1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; EXPECT_EQ(data, result_expected); } } diff --git a/test/test_xassign.cpp b/test/test_xassign.cpp index 475d0e5cf..4af38bcf1 100644 --- a/test/test_xassign.cpp +++ b/test/test_xassign.cpp @@ -9,6 +9,7 @@ #include #include +#include #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" @@ -82,6 +83,15 @@ class my_vector return m_data.begin(); } + auto rbegin() const + { + return std::make_reverse_iterator(end()); + } + auto rend() const + { + return std::make_reverse_iterator(begin()); + } + auto empty() const { return m_data.empty(); From 0b3caffb8b3a8363cc322e21d009b3a3540d0578 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:49:49 +0100 Subject: [PATCH 21/33] Use tbb and opnemp also in benchmark --- benchmark/CMakeLists.txt | 10 
+++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 6b98e63f8..412d03644 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -134,15 +134,15 @@ set(XTENSOR_BENCHMARK set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor) add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS}) -target_link_libraries(${XTENSOR_BENCHMARK_TARGET} xtensor ${GBENCHMARK_LIBRARIES}) +target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC xtensor ${GBENCHMARK_LIBRARIES}) if(XTENSOR_USE_TBB) - target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_TBB) - target_include_directories(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_INCLUDE_DIRS}) - target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_LIBRARIES}) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_TBB) + target_include_directories(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_INCLUDE_DIRS}) + target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_LIBRARIES}) endif() if(XTENSOR_USE_OPENMP) - target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_OPENMP) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_OPENMP) endif() add_custom_target(xbenchmark From 609d8489b0f98058f5ae91da894de33ff127e37b Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:50:03 +0100 Subject: [PATCH 22/33] Use the various THRESHOLDs for the strided loops --- include/xtensor/xassign.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index dc3302495..68835f047 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1167,7 +1167,7 @@ namespace xt step_dim = cut; } #if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - if (outer_loop_size > 20) + if (outer_loop_size >= XTENSOR_OPENMP_THRESHOLD / inner_loop_size) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) @@ -1216,7 +1216,7 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) - if (outer_loop_size > 20) + if (outer_loop_size > XTENSOR_TBB_THRESHOLD / inner_loop_size) { tbb::static_partitioner sp; tbb::parallel_for( From 6a9d20513cada231e941c867766cc3d0955ccf11 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:32:11 +0100 Subject: [PATCH 23/33] Add benchmark for stencil operations --- benchmark/benchmark_views.cpp | 48 +++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/benchmark/benchmark_views.cpp b/benchmark/benchmark_views.cpp index e0f9067d3..13d21366a 100644 --- a/benchmark/benchmark_views.cpp +++ b/benchmark/benchmark_views.cpp @@ -195,6 +195,54 @@ namespace xt BENCHMARK_TEMPLATE(view_assign_strided_view_noalias, float); } + namespace finite_diff + { + inline auto stencil_threedirections(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, core, xt::range(2, size)) + + xt::view(a, core, core, xt::range(0, size - 2)) + + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), 
core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + + inline auto stencil_onedirection(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 2.0 + * (xt::view(a, xt::range(2, size), core, core) + - xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_50, 50); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_100, 100); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_200, 200); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_300, 300); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_500, 500); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_50, 50); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_100, 100); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_200, 200); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_300, 300); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_500, 500); + } + namespace stridedview { From 4299283443c3f28ec17e4eec706063bfea8ae997 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:36:29 +0100 Subject: [PATCH 24/33] Add two-direction stencil too --- benchmark/benchmark_views.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/benchmark/benchmark_views.cpp b/benchmark/benchmark_views.cpp index 13d21366a..db233ec05 100644 --- a/benchmark/benchmark_views.cpp +++ b/benchmark/benchmark_views.cpp @@ -216,6 +216,23 @@ namespace xt } } + inline auto stencil_twodirections(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + inline auto stencil_onedirection(benchmark::State& state, size_t size) { for (auto _ : state) @@ -236,6 +253,11 @@ namespace xt BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_200, 200); BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_300, 300); BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_500, 500); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_50, 50); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_100, 100); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_200, 200); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_300, 300); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_500, 500); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_50, 50); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_100, 100); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_200, 200); From f66ab3f745d6d57a908344e99f6475e43ce0aa3c Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:54:08 +0100 Subject: [PATCH 25/33] Add more elaborate view test --- 
test/test_strided_assign.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 4c91534fb..b8cea8571 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -38,6 +38,20 @@ namespace xt ); return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; + { + size_t size = 50; + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + auto lhs = xt::view(b, core, core, core); + auto rhs = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + xt::noalias(lhs) = rhs; + EXPECT_TRUE(check_strided_assign(lhs, rhs)); + } { auto data = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; From 32595af23114618b17491dad64fa081c751917b7 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 23:57:48 +0100 Subject: [PATCH 26/33] Remove some buggy checks --- include/xtensor/xassign.hpp | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 68835f047..48385cce6 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -903,16 +903,8 @@ namespace xt for (size_type i = 0; i < outer_shape.size(); i++) { auto d_idx = n / stride_sizes[i]; - if (d_idx < outer_shape[i]) - { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - else - { - // that may not happen - outer_index[i] = 0; - } + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; } } }; @@ -959,12 +951,8 @@ namespace xt { i--; auto d_idx = n / stride_sizes[i]; - if (d_idx < (outer_shape[i] - 1)) - { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - // else {} // that may not happen + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; } } }; @@ -1023,13 +1011,13 @@ namespace xt const strides_type& m_strides; }; - template ::value || !has_strides::value || !possible, bool> = true> + template ::value || !possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2&) { return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; } - template ::value && has_strides::value && possible, bool> = true> + template ::value && possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { using shape_value_type = typename E1::shape_type::value_type; From df7abfb9010cc38eed6458526f6f526f4d75cc75 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Mon, 27 Feb 2023 00:06:35 +0100 Subject: [PATCH 27/33] Fix a few more bugs --- include/xtensor/xassign.hpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 48385cce6..2566bde17 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1155,7 +1155,7 @@ namespace xt step_dim = cut; } #if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - if (outer_loop_size >= XTENSOR_OPENMP_THRESHOLD / inner_loop_size) + if (outer_loop_size >= XTENSOR_OPENMP_TRESHOLD / inner_loop_size) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) @@ -1241,9 +1241,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - 
res_stepper.template store_simd( - fct_stepper.template step_simd() - ); + res_stepper.template store_simd(fct_stepper.template step_simd()); } for (std::size_t i = 0; i < simd_rest; ++i) { @@ -1257,13 +1255,24 @@ namespace xt ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) + + // need to step E1 as well if not contigous assign (e.g. view) + if (!E1::contiguous_layout) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } }, @@ -1315,12 +1324,12 @@ namespace xt } #endif } + template <> template inline void strided_loop_assigner::run(E1& e1, const E2& e2) { - strided_assign_detail::loop_sizes_t - loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); + strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); if (loop_sizes.can_do_strided_assign) { run(e1, e2, loop_sizes); From b2cea0084c907038e74ad1fe6c7219db9feec9cc Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:20:11 +0100 Subject: [PATCH 28/33] run clang-format on xcontainer --- include/xtensor/xcontainer.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 41c40be1c..26045f6b6 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -924,10 +924,15 @@ namespace xt inline bool xstrided_container::is_contiguous() const noexcept { using str_type = typename inner_strides_type::value_type; - auto is_zero = [](auto i){return i == 0;}; + auto is_zero = [](auto i) + { + return i == 0; + }; // This is unsafe if the strides are all 0. Does that happen? if (!is_contiguous_container::value) + { return false; + } if (m_layout == layout_type::row_major) { auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero); @@ -939,7 +944,9 @@ namespace xt auto it = std::find_if_not(m_strides.begin(), m_strides.end(), is_zero); // If the array has strides of zero, it is a constant, and therefore contiguous. return it == m_strides.end() || *it == str_type(1); - } else { + } + else + { return m_strides.empty(); } } From d84724e02754f4f1fa99026858c8261b5413b816 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:20:41 +0100 Subject: [PATCH 29/33] Run clang-format on xassign & test_xassign --- include/xtensor/xassign.hpp | 1 + test/test_xassign.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 2566bde17..74e307956 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1346,6 +1346,7 @@ namespace xt inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&) { } + template <> template inline void strided_loop_assigner::run(E1& e1, const E2& e2) diff --git a/test/test_xassign.cpp b/test/test_xassign.cpp index 4af38bcf1..5367a32da 100644 --- a/test/test_xassign.cpp +++ b/test/test_xassign.cpp @@ -7,9 +7,9 @@ * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ +#include #include #include -#include #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" @@ -87,6 +87,7 @@ class my_vector { return std::make_reverse_iterator(end()); } + auto rend() const { return std::make_reverse_iterator(begin()); From 68a5a9efbae2c150c65f11437578a7002a523186 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:42:16 +0100 Subject: [PATCH 30/33] Fix for column layout default --- test/test_strided_assign.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index b8cea8571..800764258 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -13,6 +13,7 @@ #include "xtensor/xadapt.hpp" #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" +#include "xtensor/xlayout.hpp" #include "xtensor/xnoalias.hpp" #include "xtensor/xtensor.hpp" #include "xtensor/xview.hpp" @@ -55,8 +56,11 @@ namespace xt { auto data = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - auto simple_xtensor_12 = xt::xtensor{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; - auto simple_xtensor_16 = xt::xtensor{ + auto simple_xtensor_12 = xt::xtensor{ + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}}; + auto simple_xtensor_16 = xt::xtensor{ {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, @@ -111,8 +115,18 @@ namespace xt { std::vector data2{-1, -1, -1, -1}; - auto linear_adapter = xt::adapt(data2.data(), 4, xt::no_ownership(), std::vector{4, 1}); - auto adapter_cont2 = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{1, 4}); + auto linear_adapter = xt::adapt( + data2.data(), + 4, + xt::no_ownership(), + std::vector{4, 1} + ); + auto adapter_cont2 = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{1, 4} + ); EXPECT_TRUE(linear_adapter.is_contiguous()); EXPECT_TRUE(adapter_cont2.is_contiguous()); bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); @@ -154,7 +168,12 @@ namespace xt { std::vector data2{-1, -1, -1, -1, -1, -1}; - auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); + auto linear_adapter2 = xt::adapt( + data2.data(), + 6, + xt::no_ownership(), + std::vector{3, 2} + ); auto strided_adapter = xt::adapt( data.data(), 6, From 36483d68b4cfa7c0c85383bd4d7a9664eb5b235c Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:55:22 +0100 Subject: [PATCH 31/33] Fix openmp version --- include/xtensor/xassign.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 74e307956..0eb601400 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1177,7 +1177,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd(fct_stepper.template step_simd()); + res_stepper.template store_simd(fct_stepper.template step_simd()); } for (std::size_t i = 0; i < simd_rest; ++i) { @@ -1191,13 +1191,24 @@ namespace xt ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) + + // need to step E1 as well if not contigous assign (e.g. 
view) + if (!E1::contiguous_layout) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } } From 2f678c1544378004eb37bc27419c8de2ebd1f645 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 15:37:03 +0100 Subject: [PATCH 32/33] Try to fix the VS2015 test on appveyor --- test/test_strided_assign.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 800764258..e10ba74c2 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -33,10 +33,12 @@ namespace xt auto check_strided_assign = [](auto a, auto b) { assert_compatible_shape(a, b); +#ifndef _WIN32 static_assert( xassign_traits::strided_assign(), "Failed to do strided assign" ); +#endif return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; { From 3bf9d1b688c8665cf543b8f48bb8a6b828df12d9 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 16 Mar 2023 16:24:14 +0100 Subject: [PATCH 33/33] Remove no longer relevant comment and refactor a few lines --- include/xtensor/xassign.hpp | 13 +++++-------- include/xtensor/xcontainer.hpp | 3 ++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 0eb601400..9461320a8 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -470,16 +470,13 @@ namespace xt linear_assigner::run(de1, de2); } } + else if (simd_strided_assign) + { + strided_loop_assigner::run(de1, de2); + } else { - if (simd_strided_assign) - { - strided_loop_assigner::run(de1, de2); - } - else - { - stepper_assigner(de1, de2).run(); - } + stepper_assigner(de1, de2).run(); } } diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 26045f6b6..d4621672d 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -928,11 +928,12 @@ namespace xt { return i == 0; }; - // This is unsafe if the strides are all 0. Does that happen? if (!is_contiguous_container::value) { return false; } + // We need to make sure the inner-most non-zero stride is one. + // Trailing zero strides are ignored because they indicate bradcasted dimensions. if (m_layout == layout_type::row_major) { auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero);
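
Illustrative usage sketch (not part of the patch series): the snippet below exercises the strided assignment path these commits add, using only calls that already appear in test_strided_assign.cpp (xt::adapt with an explicit stride vector and xt::noalias assignment). It assumes an xtensor build with these patches applied; the variable names (strided_dst, linear_src) are made up for the example.

#include <cassert>
#include <cstddef>
#include <vector>

#include "xtensor/xadapt.hpp"
#include "xtensor/xnoalias.hpp"

int main()
{
    std::vector<double> src(6);
    std::vector<double> dst(20, -1.0);
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        src[i] = static_cast<double>(i);  // 0, 1, 2, 3, 4, 5
    }

    // A 3x2 destination window with row stride 4: each row is contiguous but there
    // are gaps between rows, so the one-dimensional linear assign is rejected and
    // the strided loop (inner loop of 2 elements, outer loop of 3 rows) is used.
    auto strided_dst = xt::adapt(
        dst.data(), 6, xt::no_ownership(),
        std::vector<std::size_t>{3, 2}, std::vector<std::size_t>{4, 1});
    auto linear_src = xt::adapt(
        src.data(), 6, xt::no_ownership(), std::vector<std::size_t>{3, 2});

    xt::noalias(strided_dst) = linear_src;

    // Only the strided window is written; the gap elements keep their -1 fill.
    assert(dst[0] == 0.0 && dst[1] == 1.0 && dst[2] == -1.0);
    assert(dst[4] == 2.0 && dst[5] == 3.0 && dst[8] == 4.0 && dst[9] == 5.0);
    return 0;
}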