From e3ab51704449ed804a52b97aeef57e369234e425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 4 Feb 2021 13:09:55 +0100 Subject: [PATCH 01/33] initial thoughts parallel strided assign --- include/xtensor/xassign.hpp | 231 +++++++++++++++++++++++++++++++++++- 1 file changed, 229 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 1c5535113..79db1b1f5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -850,6 +850,80 @@ namespace xt } } } + + template + static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) + { + auto i = outer_index.size()-1; + auto stride_sizes = full_like(outer_shape,0); + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] !=0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + if (i>0) { + outer_index[i-1]++; + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + else { + if (i>0) { + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + } + i++; + // add remaining iterations + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + + template + static void nth_idx(size_t n, T& outer_index, T& outer_shape) + { + // assume outer_index is filled with 0 + auto stride_sizes = full_like(outer_shape,0); + + auto i = outer_index.size()-1; + stride_sizes[i] = 1; + i-- + // compute strides + for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + + // compute index + for (i=0; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx < outer_shape[i]) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + }; template <> @@ -874,6 +948,83 @@ namespace xt } } } + + template + static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) + { + using size_type = typename T::size_type; + size_type i = 0; + auto stride_sizes = outer_shape; + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] != 0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + + if (i+1= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + + template + static void nth_idx(size_t n, T& outer_index, T& outer_shape) + { + // assume outer_index is filled with 0 + using size_type = typename T::size_type; + auto stride_sizes = outer_shape; + size_type i = 0; + stride_sizes[i] = 1; + i++; + + // compute required strides + for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + + // compute index + i = outer_shape.size()-1; 
+ for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } }; template @@ -969,6 +1120,8 @@ namespace xt } } +#define strided_row_major_fallback +#define strided_parallel_assign template template inline void strided_loop_assigner::run(E1& e1, const E2& e2) @@ -988,7 +1141,11 @@ namespace xt is_row_major = false; break; default: +#ifdef strided_row_major_fallback + is_row_major = true; +#else return fallback_assigner(e1, e2).run(); +#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1054,7 +1211,75 @@ namespace xt { step_dim = cut; } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + static bool reported = false; + if (!reported) { + std::cout << "SIMD size is " << simd_type::size << " step_dim " << step_dim << std::endl; + reported = true; + } + std::cout << "+"; + std::size_t first_step = true; +// #define old_parallel +#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + else { + // next unaligned index + if (is_row_major) { + auto i = max_shape.size()-1; +// fct_stepper.step_back(i+1,inner_loop_size); +// res_stepper.step_back(i+1,inner_loop_size); + fct_stepper.reset(i+1); + res_stepper.reset(i+1); + fct_stepper.step_back(i+1,1); + res_stepper.step_back(i+1,1); + + + for (; i >= 0; --i) + { + if (idx[i] + 1 >= max_shape[i]) + { + idx[i] = 0; + fct_stepper.reset(i); + res_stepper.reset(i); + } + else + { + idx[i]++; + fct_stepper.step(i, 1); + res_stepper.step(i, 1); + break; + } + } + } + else { + std::cout << "oops" << std::endl; + } +// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + } +#else for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1092,7 +1317,9 @@ namespace xt } } } - } + +#endif + } template <> template From 2087ed04b4e7b5865805be9908f9ea454b9f28f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 27 May 2021 23:30:24 +0200 Subject: [PATCH 02/33] trigger strided assign properly --- include/xtensor/xassign.hpp | 568 +++++++++-------------------- include/xtensor/xstrides.hpp | 2 + include/xtensor/xview.hpp | 686 ++++++++++++++++------------------- 3 files changed, 488 insertions(+), 768 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 79db1b1f5..7c67da273 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1,30 +1,30 @@ /*************************************************************************** - * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * - * Copyright (c) QuantStack * - * * - * Distributed under the terms of the BSD 3-Clause License. * - * * - * The full license is in the file LICENSE, distributed with this software. 
* - ****************************************************************************/ +* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * +* Copyright (c) QuantStack * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ #ifndef XTENSOR_ASSIGN_HPP #define XTENSOR_ASSIGN_HPP #include -#include #include #include +#include #include #include #include "xexpression.hpp" -#include "xfunction.hpp" #include "xiterator.hpp" #include "xstrides.hpp" #include "xtensor_config.hpp" #include "xtensor_forward.hpp" #include "xutils.hpp" +#include "xfunction.hpp" #if defined(XTENSOR_USE_TBB) #include @@ -100,6 +100,7 @@ namespace xt template static bool resize(E1& e1, const xfunction& e2); + }; /******************** @@ -195,18 +196,14 @@ namespace xt template inline void assign_xexpression(xexpression& e1, const xexpression& e2) { - xtl::mpl::static_if::value>( - [&](auto self) - { - self(e2).derived_cast().assign_to(e1); - }, - /*else*/ - [&](auto /*self*/) - { - using tag = xexpression_tag_t; - xexpression_assigner::assign_xexpression(e1, e2); - } - ); + xtl::mpl::static_if::value>([&](auto self) + { + self(e2).derived_cast().assign_to(e1); + }, /*else*/ [&](auto /*self*/) + { + using tag = xexpression_tag_t; + xexpression_assigner::assign_xexpression(e1, e2); + }); } template @@ -242,19 +239,15 @@ namespace xt // A row_major or column_major container with a dimension <= 1 is computed as // layout any, leading to some performance improvements, for example when // assigning a col-major vector to a row-major vector etc - return compute_layout( - select_layout::value, - select_layout::value - ) - != layout_type::dynamic; + return compute_layout(select_layout::value, + select_layout::value) != layout_type::dynamic; } template - inline auto is_linear_assign(const E1& e1, const E2& e2) - -> std::enable_if_t::value, bool> + inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { - return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) - || (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); + return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || + (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -266,8 +259,9 @@ namespace xt template inline bool linear_dynamic_layout(const E1& e1, const E2& e2) { - return e1.is_contiguous() && e2.is_contiguous() - && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; + return e1.is_contiguous() + && e2.is_contiguous() + && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; } template @@ -276,20 +270,16 @@ namespace xt }; template - struct has_step_leading().step_leading())>> : std::true_type + struct has_step_leading().step_leading())>> + : std::true_type { }; template struct use_strided_loop { - static constexpr bool stepper_deref() - { - return std::is_reference::value; - } - - static constexpr bool value = has_strides::value - && has_step_leading::value && stepper_deref(); + static constexpr bool stepper_deref() { return std::is_reference::value; } + static constexpr bool value = has_strides::value && has_step_leading::value && stepper_deref(); }; template @@ -343,89 +333,43 @@ namespace xt template using is_bool = std::is_same; - static constexpr bool is_bool_conversion() - { - return is_bool::value && !is_bool::value; 
- } - - static constexpr bool contiguous_layout() - { - return E1::contiguous_layout && E2::contiguous_layout; - } + static constexpr bool is_bool_conversion() { return is_bool::value && !is_bool::value; } + static constexpr bool contiguous_layout() { return E1::contiguous_layout && E2::contiguous_layout; } + static constexpr bool convertible_types() { return std::is_convertible::value + && !is_bool_conversion(); } - static constexpr bool convertible_types() - { - return std::is_convertible::value && !is_bool_conversion(); - } - - static constexpr bool use_xsimd() - { - return xt_simd::simd_traits::size > 1; - } + static constexpr bool use_xsimd() { return xt_simd::simd_traits::size > 1; } template - static constexpr bool simd_size_impl() - { - return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); - } - - static constexpr bool simd_size() - { - return simd_size_impl() && simd_size_impl(); - } - - static constexpr bool simd_interface() - { - return has_simd_interface() - && has_simd_interface(); - } + static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } + static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } + static constexpr bool simd_interface() { return has_simd_interface() && + has_simd_interface(); } public: // constexpr methods instead of constexpr data members avoid the need of definitions at namespace // scope of these data members (since they are odr-used). - static constexpr bool simd_assign() - { - return convertible_types() && simd_size() && simd_interface(); - } - - static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) - { - return trivial && detail::is_linear_assign(e1, e2); - } - - static constexpr bool strided_assign() - { - return detail::use_strided_loop::value && detail::use_strided_loop::value; - } + static constexpr bool simd_assign() { return convertible_types() && simd_size() && simd_interface(); } + static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) { return trivial && detail::is_linear_assign(e1, e2); } + static constexpr bool strided_assign() { return detail::use_strided_loop::value && detail::use_strided_loop::value; } + static constexpr bool simd_linear_assign() { return contiguous_layout() && simd_assign(); } + static constexpr bool simd_strided_assign() { return strided_assign() && simd_assign(); } - static constexpr bool simd_linear_assign() - { - return contiguous_layout() && simd_assign(); - } - - static constexpr bool simd_strided_assign() - { - return strided_assign() && simd_assign(); - } + static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) { return simd_assign() + && detail::linear_dynamic_layout(e1, e2); } - static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) - { - return simd_assign() && detail::linear_dynamic_layout(e1, e2); - } + using e2_requested_value_type = std::conditional_t::value, + typename E2::bool_load_type, + e2_value_type>; + using requested_value_type = detail::conditional_promote_to_complex_t; - using e2_requested_value_type = std:: - conditional_t::value, typename E2::bool_load_type, e2_value_type>; - using requested_value_type = detail::conditional_promote_to_complex_t; }; template - inline void xexpression_assigner_base::assign_data( - xexpression& e1, - const xexpression& e2, - bool trivial - ) + inline void xexpression_assigner_base::assign_data(xexpression& e1, const xexpression& e2, bool trivial) { E1& de1 = e1.derived_cast(); const 
E2& de2 = e2.derived_cast(); @@ -437,7 +381,7 @@ namespace xt constexpr bool simd_strided_assign = traits::simd_strided_assign(); if (linear_assign) { - if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) + if(simd_linear_assign || traits::simd_linear_assign(de1, de2)) { // Do not use linear_assigner here since it will make the compiler // instantiate this branch even if the runtime condition is false, resulting @@ -451,7 +395,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) + else if (simd_strided_assign && de1.layout() == de2.layout()) { strided_loop_assigner::run(de1, de2); } @@ -487,15 +431,8 @@ namespace xt bool trivial_broadcast = de2.broadcast_shape(shape, true); - auto&& de1_shape = de1.shape(); - if (dim2 > de1.dimension() - || std::lexicographical_compare( - shape.begin(), - shape.end(), - de1_shape.begin(), - de1_shape.end(), - comperator_type() - )) + auto && de1_shape = de1.shape(); + if (dim2 > de1.dimension() || std::lexicographical_compare(shape.begin(), shape.end(), de1_shape.begin(), de1_shape.end(), comperator_type())) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -523,8 +460,7 @@ namespace xt template template - inline void - xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) + inline void xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) { const E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -568,20 +504,16 @@ namespace xt inline bool xexpression_assigner::resize(E1& e1, const xfunction& e2) { return xtl::mpl::static_if::shape_type>::value>( - [&](auto /*self*/) - { + [&](auto /*self*/) { /* * If the shape of the xfunction is statically known, we can compute the broadcast triviality * at compile time plus we can resize right away. */ // resize in case LHS is not a fixed size container. 
If it is, this is a NOP e1.resize(typename xfunction::shape_type{}); - return detail::static_trivial_broadcast< - detail::is_fixed::shape_type>::value, - CT...>::value; + return detail::static_trivial_broadcast::shape_type>::value, CT...>::value; }, - /* else */ - [&](auto /*self*/) + /* else */ [&](auto /*self*/) { using index_type = xindex_type_t; using size_type = typename E1::size_type; @@ -604,10 +536,9 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = xtl::is_arithmetic::value - && (sizeof(result_type) < sizeof(argument_type) - || (xtl::is_integral::value - && std::is_floating_point::value)); + static const bool value = xtl::is_arithmetic::value && + (sizeof(result_type) < sizeof(argument_type) || + (xtl::is_integral::value && std::is_floating_point::value)); }; template @@ -625,16 +556,15 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = is_narrowing_conversion::value - || has_sign_conversion::value; + static const bool value = is_narrowing_conversion::value || + has_sign_conversion::value; }; template inline stepper_assigner::stepper_assigner(E1& e1, const E2& e2) - : m_e1(e1) - , m_lhs(e1.stepper_begin(e1.shape())) - , m_rhs(e2.stepper_begin(e1.shape())) - , m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) + : m_e1(e1), m_lhs(e1.stepper_begin(e1.shape())), + m_rhs(e2.stepper_begin(e1.shape())), + m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) { } @@ -690,6 +620,7 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { +// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -713,18 +644,10 @@ namespace xt #if defined(XTENSOR_USE_TBB) if (size >= XTENSOR_TBB_THRESHOLD) { - tbb::parallel_for( - align_begin, - align_end, - simd_size, - [&e1, &e2](size_t i) - { - e1.template store_simd( - i, - e2.template load_simd(i) - ); - } - ); + tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) + { + e1.template store_simd(i, e2.template load_simd(i)); + }); } else { @@ -736,20 +659,20 @@ namespace xt #elif defined(XTENSOR_USE_OPENMP) if (size >= XTENSOR_OPENMP_TRESHOLD) { -#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) -#ifndef _WIN32 + #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) + #ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } -#else - for (auto i = static_cast(align_begin); i < static_cast(align_end); - i += static_cast(simd_size)) + #else + for(auto i = static_cast(align_begin); i < static_cast(align_end); + i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } -#endif + #endif } else { @@ -773,8 +696,8 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { - using is_convertible = std:: - is_convertible::value_type, typename std::decay_t::value_type>; + using is_convertible = std::is_convertible::value_type, + typename std::decay_t::value_type>; // If the types are not compatible, this function is still instantiated but never called. // To avoid compilation problems in effectively unused code trivial_assigner_run_impl is // empty in this case. 
@@ -789,19 +712,15 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); - +// std::cout << "Linear assigner LOOP SIZE is " << n <(n), - [&](std::ptrdiff_t i) - { - *(dst + i) = static_cast(*(src + i)); - } - ); + tbb::parallel_for(std::ptrdiff_t(0), static_cast(n), [&](std::ptrdiff_t i) + { + *(dst + i) = static_cast(*(src + i)); + }); #elif defined(XTENSOR_USE_OPENMP) -#pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n); i++) + #pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) { *(dst + i) = static_cast(*(src + i)); } @@ -818,7 +737,9 @@ namespace xt template inline void linear_assigner::run_impl(E1&, const E2&, std::false_type /*is_convertible*/) { - XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types."); + XTENSOR_PRECONDITION(false, + "Internal error: linear_assigner called with unrelated types."); + } /**************************************** @@ -851,75 +772,29 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) - { - auto i = outer_index.size()-1; - auto stride_sizes = full_like(outer_shape,0); - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i >= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] !=0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - if (i>0) { - outer_index[i-1]++; - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - else { - if (i>0) { - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - } - i++; - // add remaining iterations - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 - auto stride_sizes = full_like(outer_shape,0); - auto i = outer_index.size()-1; - stride_sizes[i] = 1; - i-- + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); // compute strides - for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + using size_type = typename T::size_type; + for ( size_type i = outer_shape.size(); i>0; i--) { + stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } // compute index - for (i=0; i < outer_index.size(); i++) + for (size_type i=0; i < outer_shape.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen + outer_index[i] = 0; } } } @@ -949,80 +824,27 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) - { - using size_type = typename T::size_type; - size_type i = 0; - auto stride_sizes = outer_shape; - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] != 0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - - if (i+1= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + using size_type = typename T::size_type; - auto stride_sizes = outer_shape; - size_type i = 0; - stride_sizes[i] = 1; - i++; // compute required strides - for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } // compute index - i = outer_shape.size()-1; - for (; i >= 0; i--) - { + for (size_type i = outer_shape.size(); i>0;) + { + i--; auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } - else { - // that may not happen - } +// else {} // that may not happen } } }; @@ -1033,13 +855,14 @@ namespace xt using strides_type = S; check_strides_functor(const S& strides) - : m_cut(L == layout_type::row_major ? 0 : strides.size()) - , m_strides(strides) + : m_cut(L == layout_type::row_major ? 
0 : strides.size()), + m_strides(strides) { } template - std::enable_if_t operator()(const T& el) + std::enable_if_t + operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) @@ -1050,7 +873,8 @@ namespace xt } template - std::enable_if_t operator()(const T& el) + std::enable_if_t + operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var < m_cut) @@ -1089,27 +913,22 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + if (cut(e1.strides() - ); + auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - } // can't reach here because this would have already triggered the fallback + if (cut>1) cut = 1; + } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; - std::size_t outer_loop_size = static_cast(std::accumulate( - e1.shape().begin(), - e1.shape().begin() + static_cast(cut), - shape_value_type(1), - std::multiplies{} - )); - std::size_t inner_loop_size = static_cast(std::accumulate( - e1.shape().begin() + static_cast(cut), - e1.shape().end(), - shape_value_type(1), - std::multiplies{} - )); + std::size_t outer_loop_size = static_cast( + std::accumulate(e1.shape().begin(), e1.shape().begin() + static_cast(cut), + shape_value_type(1), std::multiplies{})); + std::size_t inner_loop_size = static_cast( + std::accumulate(e1.shape().begin() + static_cast(cut), e1.shape().end(), + shape_value_type(1), std::multiplies{})); if (E1::static_layout == layout_type::column_major || !is_row_major) { @@ -1120,7 +939,6 @@ namespace xt } } -#define strided_row_major_fallback #define strided_parallel_assign template template @@ -1128,10 +946,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); + switch (dynamic_layout) { case layout_type::row_major: @@ -1141,11 +959,7 @@ namespace xt is_row_major = false; break; default: -#ifdef strided_row_major_fallback - is_row_major = true; -#else return fallback_assigner(e1, e2).run(); -#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1162,11 +976,7 @@ namespace xt } std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( - e1, - e2, - is_row_major - ); + std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes(e1, e2, is_row_major); if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) { @@ -1193,10 +1003,9 @@ namespace xt using e2_value_type = typename E2::value_type; constexpr bool needs_cast = has_assign_conversion::value; using value_type = typename xassign_traits::requested_value_type; - using simd_type = std::conditional_t< - std::is_same::value, - xt_simd::simd_bool_type, - xt_simd::simd_type>; + using simd_type = std::conditional_t::value, + xt_simd::simd_bool_type, + xt_simd::simd_type>; std::size_t simd_size = inner_loop_size / simd_type::size; std::size_t simd_rest = inner_loop_size % simd_type::size; @@ -1207,79 +1016,59 @@ namespace xt // TODO in 1D case this is ambigous -- could be RM or CM. 
// Use default layout to make decision std::size_t step_dim = 0; - if (!is_row_major) // row major case + if (!is_row_major) // row major case { step_dim = cut; } +// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - else { - // next unaligned index - if (is_row_major) { - auto i = max_shape.size()-1; -// fct_stepper.step_back(i+1,inner_loop_size); -// res_stepper.step_back(i+1,inner_loop_size); - fct_stepper.reset(i+1); - res_stepper.reset(i+1); - fct_stepper.step_back(i+1,1); - res_stepper.step_back(i+1,1); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (; i >= 0; --i) + for (std::size_t i = 0; i < idx.size(); ++i) { - if (idx[i] + 1 >= max_shape[i]) - { - idx[i] = 0; - fct_stepper.reset(i); - res_stepper.reset(i); - } - else - { - idx[i]++; - fct_stepper.step(i, 1); - res_stepper.step(i, 1); - break; - } + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } + first_step = false; } - else { - std::cout << "oops" << std::endl; + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } -// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); } - } -#else + } + else { + +#endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1293,9 +1082,9 @@ namespace xt fct_stepper.step_leading(); } - is_row_major - ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) - : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + is_row_major ? 
+ strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); fct_stepper.to_begin(); @@ -1317,7 +1106,8 @@ namespace xt } } } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + } #endif } diff --git a/include/xtensor/xstrides.hpp b/include/xtensor/xstrides.hpp index ae261e354..99ffa811c 100644 --- a/include/xtensor/xstrides.hpp +++ b/include/xtensor/xstrides.hpp @@ -544,8 +544,10 @@ namespace xt { for (std::size_t i = strides.size(); i != 0; --i) { +// std::cout << "matching: " << strides[i - 1] << " && " << shape[i - 1]<< " && " << data_size << std::endl; if (!stride_match_condition(strides[i - 1], shape[i - 1], data_size, zero_strides)) { +// std::cout << "non-matching: " << strides[i - 1] << " != " << data_size << std::endl; return false; } data_size *= static_cast(shape[i - 1]); diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 0a2dbb6be..931bb67a9 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -1,11 +1,11 @@ /*************************************************************************** - * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * - * Copyright (c) QuantStack * - * * - * Distributed under the terms of the BSD 3-Clause License. * - * * - * The full license is in the file LICENSE, distributed with this software. * - ****************************************************************************/ +* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * +* Copyright (c) QuantStack * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ #ifndef XTENSOR_VIEW_HPP #define XTENSOR_VIEW_HPP @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include "xaccessible.hpp" @@ -34,6 +34,7 @@ #include "xtensor_forward.hpp" #include "xview_utils.hpp" + namespace xt { @@ -75,22 +76,26 @@ namespace xt { template - struct is_xrange : std::false_type + struct is_xrange + : std::false_type { }; template - struct is_xrange> : std::true_type + struct is_xrange> + : std::true_type { }; template - struct is_xall_slice : std::false_type + struct is_xall_slice + : std::false_type { }; template - struct is_xall_slice> : std::true_type + struct is_xall_slice> + : std::true_type { }; @@ -129,33 +134,32 @@ namespace xt template struct is_xscalar_impl> { - static constexpr bool value = static_cast(integral_count() - ) == static_dimension::shape_type>::value - ? true - : false; + static constexpr bool value = static_cast(integral_count()) == static_dimension::shape_type>::value ? true : false; }; template - struct is_strided_slice_impl : std::true_type + struct is_strided_slice_impl + : std::true_type { }; template - struct is_strided_slice_impl> : std::false_type + struct is_strided_slice_impl> + : std::false_type { }; template - struct is_strided_slice_impl> : std::false_type + struct is_strided_slice_impl> + : std::false_type { }; // If we have no discontiguous slices, we can calculate strides for this view. 
template struct is_strided_view - : std::integral_constant< - bool, - xtl::conjunction, is_strided_slice_impl>...>::value> + : std::integral_constant, + is_strided_slice_impl>...>::value> { }; @@ -179,20 +183,25 @@ namespace xt static constexpr bool have_all_seen = all_seen || is_all_slice; static constexpr bool have_range_seen = is_range_slice; - static constexpr bool is_valid = valid - && (have_all_seen - ? is_all_slice - : (!range_seen && (is_int_slice || is_range_slice))); - - static constexpr bool value = is_contiguous_view_impl < layout_type::row_major, is_valid, - have_all_seen, range_seen || is_range_slice, - xtl::mpl::pop_front_t < V >> ::value; + static constexpr bool is_valid = valid && (have_all_seen ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); + + static constexpr bool value = is_contiguous_view_impl>::value; + static constexpr size_t contiguous_dimensions = is_contiguous_view_impl>::contiguous_dimensions + (value ? 1 :0); }; template struct is_contiguous_view_impl> { static constexpr bool value = valid; + static constexpr size_t contiguous_dimensions = 0; }; // For column major the *same* but reverse is true -- with the additional @@ -208,13 +217,12 @@ namespace xt static constexpr bool have_int_seen = int_seen || is_int_slice; - static constexpr bool is_valid = valid - && (have_int_seen - ? is_int_slice - : (!range_seen && (is_all_slice || is_range_slice))); - static constexpr bool value = is_contiguous_view_impl < layout_type::column_major, is_valid, - have_int_seen, is_range_slice || range_seen, - xtl::mpl::pop_front_t < V >> ::value; + static constexpr bool is_valid = valid && (have_int_seen ? is_int_slice : (!range_seen && (is_all_slice || is_range_slice))); + static constexpr bool value = is_contiguous_view_impl>::value; }; template @@ -226,14 +234,11 @@ namespace xt // TODO relax has_data_interface constraint here! template struct is_contiguous_view - : std::integral_constant< - bool, - has_data_interface::value - && !( - E::static_layout == layout_type::column_major - && static_cast(static_dimension::value) != sizeof...(S) - ) - && is_contiguous_view_impl>::value> + : std::integral_constant::value && + !(E::static_layout == layout_type::column_major && static_cast(static_dimension::value) != sizeof...(S)) && + is_contiguous_view_impl>::value + > { }; @@ -271,20 +276,21 @@ namespace xt struct get_contigous_shape_type { // if we have no `range` in the slices we can re-use the shape with an offset - using type = std::conditional_t< - xtl::disjunction...>::value, - typename xview_shape_type::type, - // In the false branch we know that we have only integers at the front OR end, and NO range - typename unwrap_offset_container()>::type>; + using type = std::conditional_t...>::value, + typename xview_shape_type::type, + // In the false branch we know that we have only integers at the front OR end, and NO range + typename unwrap_offset_container()>::type>; }; template - struct is_sequence_view : std::integral_constant + struct is_sequence_view + : std::integral_constant { }; template - struct is_sequence_view> : std::integral_constant + struct is_sequence_view> + : std::integral_constant { }; } @@ -298,16 +304,15 @@ namespace xt using size_type = typename xexpression_type::size_type; using temporary_type = view_temporary_type_t; - static constexpr layout_type layout = detail::is_contiguous_view::value - ? xexpression_type::static_layout - : layout_type::dynamic; + static constexpr layout_type layout = + detail::is_contiguous_view::value ? 
+ xexpression_type::static_layout : layout_type::dynamic; static constexpr bool is_const = std::is_const>::value; - using extract_storage_type = xtl::mpl::eval_if_t< - has_data_interface, - detail::expr_storage_type, - make_invalid_type<>>; + using extract_storage_type = xtl::mpl::eval_if_t, + detail::expr_storage_type, + make_invalid_type<>>; using storage_type = std::conditional_t; }; @@ -319,20 +324,17 @@ namespace xt static constexpr bool is_strided_view = detail::is_strided_view::value; static constexpr bool is_contiguous_view = detail::is_contiguous_view::value; - using inner_shape_type = std::conditional_t< - is_contiguous_view, - typename detail::get_contigous_shape_type::type, - typename xview_shape_type::type>; + using inner_shape_type = std::conditional_t::type, + typename xview_shape_type::type>; - using stepper = std::conditional_t< - is_strided_view, - xstepper>, - xview_stepper>::value, CT, S...>>; + using stepper = std::conditional_t>, + xview_stepper>::value, CT, S...>>; - using const_stepper = std::conditional_t< - is_strided_view, - xstepper>, - xview_stepper, S...>>; + using const_stepper = std::conditional_t>, + xview_stepper, S...>>; }; /** @@ -352,9 +354,9 @@ namespace xt template class xview : public xview_semantic>, public std::conditional_t< - detail::is_contiguous_view, S...>::value, - xcontiguous_iterable>, - xiterable>>, + detail::is_contiguous_view, S...>::value, + xcontiguous_iterable>, + xiterable>>, public xaccessible>, public extension::xview_base_t { @@ -376,8 +378,9 @@ namespace xt using bool_load_type = typename xexpression_type::bool_load_type; using reference = typename inner_types::reference; using const_reference = typename inner_types::const_reference; - using pointer = std:: - conditional_t; + using pointer = std::conditional_t; using const_pointer = typename xexpression_type::const_pointer; using size_type = typename inner_types::size_type; using difference_type = typename xexpression_type::difference_type; @@ -392,35 +395,28 @@ namespace xt using inner_shape_type = typename iterable_base::inner_shape_type; using shape_type = typename xview_shape_type::type; - using xexpression_inner_strides_type = xtl::mpl::eval_if_t< - has_strides, - detail::expr_inner_strides_type, - get_strides_type>; + using xexpression_inner_strides_type = xtl::mpl::eval_if_t, + detail::expr_inner_strides_type, + get_strides_type>; - using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t< - has_strides, - detail::expr_inner_backstrides_type, - get_strides_type>; + using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t, + detail::expr_inner_backstrides_type, + get_strides_type>; using storage_type = typename inner_types::storage_type; - static constexpr bool has_trivial_strides = is_contiguous_view - && !xtl::disjunction...>::value; - using inner_strides_type = std::conditional_t< - has_trivial_strides, - typename detail::unwrap_offset_container< - xexpression_type::static_layout, - xexpression_inner_strides_type, - integral_count()>::type, - get_strides_t>; - - using inner_backstrides_type = std::conditional_t< - has_trivial_strides, - typename detail::unwrap_offset_container< - xexpression_type::static_layout, - xexpression_inner_backstrides_type, - integral_count()>::type, - get_strides_t>; + static constexpr bool has_trivial_strides = is_contiguous_view && !xtl::disjunction...>::value; + using inner_strides_type = std::conditional_t()>::type, + get_strides_t>; + + using inner_backstrides_type = std::conditional_t()>::type, + get_strides_t>; using 
strides_type = get_strides_t; using backstrides_type = strides_type; @@ -431,21 +427,21 @@ namespace xt using stepper = typename iterable_base::stepper; using const_stepper = typename iterable_base::const_stepper; - using linear_iterator = std::conditional_t< - has_data_interface::value && is_strided_view, - std::conditional_t, - typename iterable_base::linear_iterator>; - using const_linear_iterator = std::conditional_t< - has_data_interface::value && is_strided_view, - typename xexpression_type::const_linear_iterator, - typename iterable_base::const_linear_iterator>; + using linear_iterator = std::conditional_t::value && is_strided_view, + std::conditional_t, + typename iterable_base::linear_iterator>; + using const_linear_iterator = std::conditional_t::value && is_strided_view, + typename xexpression_type::const_linear_iterator, + typename iterable_base::const_linear_iterator>; using reverse_linear_iterator = std::reverse_iterator; using const_reverse_linear_iterator = std::reverse_iterator; using container_iterator = pointer; using const_container_iterator = const_pointer; - static constexpr std::size_t rank = SIZE_MAX; + constexpr static std::size_t rank = SIZE_MAX; // The FSL argument prevents the compiler from calling this constructor // instead of the copy constructor when sizeof...(SL) == 0. @@ -494,36 +490,48 @@ namespace xt bool has_linear_assign(const ST& strides) const; template - std::enable_if_t stepper_begin(const ST& shape); + std::enable_if_t + stepper_begin(const ST& shape); template - std::enable_if_t stepper_end(const ST& shape, layout_type l); + std::enable_if_t + stepper_end(const ST& shape, layout_type l); template - std::enable_if_t stepper_begin(const ST& shape) const; + std::enable_if_t + stepper_begin(const ST& shape) const; template - std::enable_if_t stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t + stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t stepper_begin(const ST& shape); + std::enable_if_t + stepper_begin(const ST& shape); template - std::enable_if_t stepper_end(const ST& shape, layout_type l); + std::enable_if_t + stepper_end(const ST& shape, layout_type l); template - std::enable_if_t stepper_begin(const ST& shape) const; + std::enable_if_t + stepper_begin(const ST& shape) const; template - std::enable_if_t stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t + stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t::value, storage_type&> storage(); + std::enable_if_t::value, storage_type&> + storage(); template - std::enable_if_t::value, const storage_type&> storage() const; + std::enable_if_t::value, const storage_type&> + storage() const; template - std::enable_if_t::value && is_strided_view, linear_iterator> linear_begin(); + std::enable_if_t::value && is_strided_view, linear_iterator> + linear_begin(); template - std::enable_if_t::value && is_strided_view, linear_iterator> linear_end(); + std::enable_if_t::value && is_strided_view, linear_iterator> + linear_end(); template std::enable_if_t::value && is_strided_view, const_linear_iterator> @@ -574,10 +582,12 @@ namespace xt backstrides() const; template - std::enable_if_t::value && is_strided_view, const_pointer> data() const; + std::enable_if_t::value && is_strided_view, const_pointer> + data() const; template - std::enable_if_t::value && is_strided_view, pointer> data(); + std::enable_if_t::value && is_strided_view, pointer> + data(); template std::enable_if_t::value && is_strided_view, std::size_t> @@ 
-610,13 +620,11 @@ namespace xt size_type underlying_size(size_type dim) const; xtl::xclosure_pointer operator&() &; - xtl::xclosure_pointer operator&() const&; + xtl::xclosure_pointer operator&() const &; xtl::xclosure_pointer operator&() &&; - template < - class E, - class T = xexpression_type, - class = std::enable_if_t::value && is_contiguous_view, int>> + template ::value && is_contiguous_view, int>> void assign_to(xexpression& e, bool force_resize) const; template @@ -638,11 +646,9 @@ namespace xt template enable_simd_interface store_simd(size_type i, const simd& e); - template < - class align, - class requested_type = value_type, - std::size_t N = xt_simd::simd_traits::size, - class T = xexpression_type> + template ::size, + class T = xexpression_type> enable_simd_interface> load_simd(size_type i) const; template @@ -783,7 +789,9 @@ namespace xt { public: - using view_type = std::conditional_t, xview>; + using view_type = std::conditional_t, + xview>; using substepper_type = get_stepper; using value_type = typename substepper_type::value_type; @@ -795,13 +803,8 @@ namespace xt using shape_type = typename substepper_type::shape_type; xview_stepper() = default; - xview_stepper( - view_type* view, - substepper_type it, - size_type offset, - bool end = false, - layout_type l = XTENSOR_DEFAULT_TRAVERSAL - ); + xview_stepper(view_type* view, substepper_type it, + size_type offset, bool end = false, layout_type l = XTENSOR_DEFAULT_TRAVERSAL); reference operator*() const; @@ -884,7 +887,7 @@ namespace xt std::forward(e), std::forward(first_slice), std::forward(slices)... - ) + ) { } @@ -892,26 +895,25 @@ namespace xt template template xview::xview(std::true_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)) - , m_slices(std::forward(first_slice), std::forward(slices)...) - , m_shape(compute_shape(detail::is_sequence_view{})) - , m_strides(m_e.strides()) - , m_backstrides(m_e.backstrides()) - , m_data_offset(data_offset_impl(std::make_index_sequence())) - , m_strides_computed(true) + : m_e(std::forward(e)), + m_slices(std::forward(first_slice), std::forward(slices)...), + m_shape(compute_shape(detail::is_sequence_view{})), + m_strides(m_e.strides()), + m_backstrides(m_e.backstrides()), + m_data_offset(data_offset_impl(std::make_index_sequence())), + m_strides_computed(true) { } template template xview::xview(std::false_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)) - , m_slices(std::forward(first_slice), std::forward(slices)...) - , m_shape(compute_shape(std::false_type{})) - , m_strides_computed(false) + : m_e(std::forward(e)), + m_slices(std::forward(first_slice), std::forward(slices)...), + m_shape(compute_shape(std::false_type{})), + m_strides_computed(false) { } - //@} template @@ -934,7 +936,6 @@ namespace xt { return semantic_base::operator=(e); } - //@} template @@ -973,36 +974,50 @@ namespace xt template inline layout_type xview::layout() const noexcept { - return xtl::mpl::static_if( - [&](auto self) + return xtl::mpl::static_if([&](auto self) + { + if (static_layout != layout_type::dynamic) { - if (static_layout != layout_type::dynamic) - { - return static_layout; - } - else - { - bool strides_match = do_strides_match( - self(this)->shape(), - self(this)->strides(), - self(this)->m_e.layout(), - true - ); - return strides_match ? 
self(this)->m_e.layout() : layout_type::dynamic; - } - }, - /* else */ - [&](auto /*self*/) + return static_layout; + } + else { - return layout_type::dynamic; + return self(this)->m_e.layout()==layout_type::row_major ? + ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : + ((self(this)->strides()[0] == 1) ? self(this)->m_e.layout() : layout_type::dynamic); + + bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); + return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; } - ); + }, + /* else */ [&](auto /*self*/) + { + std::cout << "View is not strided!" << std::endl; + return layout_type::dynamic; + }); } template inline bool xview::is_contiguous() const noexcept { - return layout() != layout_type::dynamic; + return xtl::mpl::static_if([&](auto self) + { + if (static_layout != layout_type::dynamic) + { + return true; + } + else + { + bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); + return strides_match ? self(this)->m_e.layout()!=layout_type::dynamic : false; + } + }, + /* else */ [&](auto /*self*/) + { + return false; + }); + +// return layout() != layout_type::dynamic; } //@} @@ -1020,17 +1035,13 @@ namespace xt template inline void xview::fill(const T& value) { - xtl::mpl::static_if( - [&](auto self) - { - std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); - }, - /*else*/ - [&](auto self) - { - std::fill(self(this)->begin(), self(this)->end(), value); - } - ); + xtl::mpl::static_if([&](auto self) + { + std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); + }, /*else*/ [&](auto self) + { + std::fill(self(this)->begin(), self(this)->end(), value); + }); } /** @@ -1076,6 +1087,7 @@ namespace xt return unchecked_impl(make_index_sequence(args...), static_cast(args)...); } + template template inline auto xview::element(It first, It last) -> reference @@ -1163,15 +1175,16 @@ namespace xt */ template template - inline auto xview::storage() -> std::enable_if_t::value, storage_type&> + inline auto xview::storage() -> + std::enable_if_t::value, storage_type&> { return m_e.storage(); } template template - inline auto xview::storage() const - -> std::enable_if_t::value, const storage_type&> + inline auto xview::storage() const -> + std::enable_if_t::value, const storage_type&> { return m_e.storage(); } @@ -1277,8 +1290,8 @@ namespace xt */ template template - inline auto xview::strides() const - -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::strides() const -> + std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1290,8 +1303,8 @@ namespace xt template template - inline auto xview::backstrides() const - -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::backstrides() const -> + std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1306,16 +1319,16 @@ namespace xt */ template template - inline auto xview::data() const - -> std::enable_if_t::value && is_strided_view, const_pointer> + inline auto xview::data() const -> + std::enable_if_t::value && is_strided_view, const_pointer> { return m_e.data(); } template template - inline auto xview::data() - -> std::enable_if_t::value && is_strided_view, pointer> + inline auto xview::data() -> + std::enable_if_t::value && 
is_strided_view, pointer> { return m_e.data(); } @@ -1324,9 +1337,9 @@ namespace xt template inline std::size_t xview::data_offset_impl(std::index_sequence) const noexcept { - auto temp = std::array( - {(static_cast(xt::value(std::get(m_slices), 0)))...} - ); + auto temp = std::array({ + (static_cast(xt::value(std::get(m_slices), 0)))... + }); std::ptrdiff_t result = 0; std::size_t i = 0; @@ -1346,8 +1359,8 @@ namespace xt */ template template - inline auto xview::data_offset() const noexcept - -> std::enable_if_t::value && is_strided_view, std::size_t> + inline auto xview::data_offset() const noexcept -> + std::enable_if_t::value && is_strided_view, std::size_t> { if (!m_strides_computed) { @@ -1356,7 +1369,6 @@ namespace xt } return m_data_offset; } - //@} template @@ -1372,7 +1384,7 @@ namespace xt } template - inline auto xview::operator&() const& -> xtl::xclosure_pointer + inline auto xview::operator&() const & -> xtl::xclosure_pointer { return xtl::closure_pointer(*this); } @@ -1409,20 +1421,15 @@ namespace xt template inline bool xview::has_linear_assign(const ST& str) const { - return xtl::mpl::static_if( - [&](auto self) - { - return str.size() == self(this)->strides().size() - && std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); - }, - /*else*/ - [](auto /*self*/) - { - return false; - } - ); - } + return xtl::mpl::static_if([&](auto self) + { + return str.size() == self(this)->strides().size() && + std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); + }, /*else*/ [](auto /*self*/){ + return false; + }); + } //@} template @@ -1458,8 +1465,7 @@ namespace xt } template - inline auto xview::data_xend(layout_type l, size_type offset) const noexcept - -> const_container_iterator + inline auto xview::data_xend(layout_type l, size_type offset) const noexcept -> const_container_iterator { return data_xend_impl(data() + data_offset(), l, offset); } @@ -1497,8 +1503,7 @@ namespace xt template template - inline auto xview::load_simd(size_type i) const - -> enable_simd_interface> + inline auto xview::load_simd(size_type i) const -> enable_simd_interface> { return m_e.template load_simd(data_offset() + i); } @@ -1537,10 +1542,9 @@ namespace xt template inline auto xview::make_index_sequence(Args...) const noexcept { - return std::make_index_sequence< - (sizeof...(Args) + integral_count() > newaxis_count() - ? sizeof...(Args) + integral_count() - newaxis_count() - : 0)>(); + return std::make_index_sequence<(sizeof...(Args)+integral_count() > newaxis_count() ? + sizeof...(Args)+integral_count() - newaxis_count() : + 0)>(); } template @@ -1548,12 +1552,11 @@ namespace xt inline auto xview::compute_strides_impl(std::index_sequence) const noexcept { std::size_t original_dim = m_e.dimension(); - return std::array( - {(static_cast(xt::step_size(std::get(I)>(m_slices), 1)) - * ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim - ? m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] - : 1))...} - ); + return std::array({ + (static_cast(xt::step_size(std::get(I)>(m_slices), 1)) * + ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim ? + m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] : 1))... + }); } template @@ -1629,8 +1632,7 @@ namespace xt template template ::size_type... I, class... Args> - inline auto xview::unchecked_impl(std::index_sequence, Args... args) const - -> const_reference + inline auto xview::unchecked_impl(std::index_sequence, Args... 
args) const -> const_reference { return m_e.unchecked(index(args...)...); } @@ -1651,19 +1653,15 @@ namespace xt template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const - -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> { - return sliced_access(I) + newaxis_count_before(I + 1)>( - std::get(I + 1)>(m_slices), - args... - ); + return sliced_access(I) + newaxis_count_before(I + 1)> + (std::get(I + 1)>(m_slices), args...); } template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const - -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> { return argument() + newaxis_count()>(args...); } @@ -1680,8 +1678,7 @@ namespace xt inline auto xview::sliced_access(const xslice& slice, Arg arg, Args... args) const -> size_type { using ST = typename T::size_type; - return static_cast(slice.derived_cast( - )(argument(static_cast(arg), static_cast(args)...))); + return static_cast(slice.derived_cast()(argument(static_cast(arg), static_cast(args)...))); } template @@ -1698,17 +1695,14 @@ namespace xt auto index = xtl::make_sequence(m_e.dimension(), 0); using diff_type = typename std::iterator_traits::difference_type; using ivalue_type = typename base_index_type::value_type; - auto func1 = [&first](const auto& s) noexcept - { + auto func1 = [&first](const auto& s) noexcept { return get_slice_value(s, first); }; - auto func2 = [](const auto& s) noexcept - { + auto func2 = [](const auto& s) noexcept { return xt::value(s, 0); }; - auto s = static_cast((std::min - )(static_cast(std::distance(first, last)), this->dimension())); + auto s = static_cast((std::min)(static_cast(std::distance(first, last)), this->dimension())); auto first_copy = last - s; for (size_type i = 0; i != m_e.dimension(); ++i) { @@ -1720,12 +1714,13 @@ namespace xt if (first < last) { - index[i] = k < sizeof...(S) ? apply(k, func1, m_slices) - : static_cast(*first); + index[i] = k < sizeof...(S) ? + apply(k, func1, m_slices) : static_cast(*first); } else { - index[i] = k < sizeof...(S) ? apply(k, func2, m_slices) : ivalue_type(0); + index[i] = k < sizeof...(S) ? + apply(k, func2, m_slices) : ivalue_type(0); } } return index; @@ -1742,15 +1737,12 @@ namespace xt { std::size_t dim = m_e.dimension() - integral_count() + newaxis_count(); auto shape = xtl::make_sequence(dim, 0); - auto func = [](const auto& s) noexcept - { - return get_size(s); - }; + auto func = [](const auto& s) noexcept { return get_size(s); }; for (size_type i = 0; i != dim; ++i) { size_type index = integral_skip(i); - shape[i] = index < sizeof...(S) ? apply(index, func, m_slices) - : m_e.shape()[index - newaxis_count_before(index)]; + shape[i] = index < sizeof...(S) ? 
+ apply(index, func, m_slices) : m_e.shape()[index - newaxis_count_before(index)]; } return shape; } @@ -1764,8 +1756,7 @@ namespace xt } template - inline void - run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) + inline void run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) { std::copy(t.cbegin(), t.cend(), v.begin()); } @@ -1774,8 +1765,8 @@ namespace xt template inline void xview::assign_temporary_impl(temporary_type&& tmp) { - constexpr bool fast_assign = detail::is_strided_view::value - && xassign_traits, temporary_type>::simd_strided_assign(); + constexpr bool fast_assign = detail::is_strided_view::value && \ + xassign_traits, temporary_type>::simd_strided_assign(); xview_detail::run_assign_temporary_impl(*this, tmp, std::integral_constant{}); } @@ -1808,13 +1799,8 @@ namespace xt { // Checks that no ellipsis slice is used using view_type = xview, get_slice_type, S>...>; - return view_type( - std::forward(e), - get_slice_implementation( - e, - std::forward(slices), - get_underlying_shape_index, S...>(I) - )... + return view_type(std::forward(e), + get_slice_implementation(e, std::forward(slices), get_underlying_shape_index, S...>(I))... ); } } @@ -1831,11 +1817,7 @@ namespace xt template inline auto view(E&& e, S&&... slices) { - return detail::make_view_impl( - std::forward(e), - std::make_index_sequence(), - std::forward(slices)... - ); + return detail::make_view_impl(std::forward(e), std::make_index_sequence(), std::forward(slices)...); } namespace detail @@ -1843,9 +1825,8 @@ namespace xt class row_impl { public: - - template - inline static auto make(E&& e, const std::ptrdiff_t index) + template + static inline auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1853,21 +1834,17 @@ namespace xt } private: - - template - inline static void check_dimension(const S& shape) + template + static inline void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW( - std::invalid_argument, - "A row can only be accessed on an expression with exact two dimensions" - ); + XTENSOR_THROW(std::invalid_argument, "A row can only be accessed on an expression with exact two dimensions"); } } - template - inline static void check_dimension(const std::array&) + template + static inline void check_dimension(const std::array&) { static_assert(N == 2, "A row can only be accessed on an expression with exact two dimensions"); } @@ -1876,9 +1853,8 @@ namespace xt class column_impl { public: - - template - inline static auto make(E&& e, const std::ptrdiff_t index) + template + static inline auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1886,21 +1862,17 @@ namespace xt } private: - - template - inline static void check_dimension(const S& shape) + template + static inline void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW( - std::invalid_argument, - "A column can only be accessed on an expression with exact two dimensions" - ); + XTENSOR_THROW(std::invalid_argument, "A column can only be accessed on an expression with exact two dimensions"); } } - template - inline static void check_dimension(const std::array&) + template + static inline void check_dimension(const std::array&) { static_assert(N == 2, "A column can only be accessed on an expression with exact two dimensions"); } @@ -1951,8 +1923,7 @@ namespace xt template template - inline auto 
xview::stepper_end(const ST& shape, layout_type l) - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, m_e.stepper_end(m_e.shape(), l), offset, true, l); @@ -1960,8 +1931,7 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const - -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1970,8 +1940,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1988,8 +1957,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, data_xend(l, offset), offset); @@ -1997,8 +1965,7 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const - -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xbegin(), offset); @@ -2006,8 +1973,7 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const - -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const-> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xend(l, offset), offset); @@ -2018,24 +1984,14 @@ namespace xt ********************************/ template - inline xview_stepper::xview_stepper( - view_type* view, - substepper_type it, - size_type offset, - bool end, - layout_type l - ) - : p_view(view) - , m_it(it) - , m_offset(offset) + inline xview_stepper::xview_stepper(view_type* view, substepper_type it, + size_type offset, bool end, layout_type l) + : p_view(view), m_it(it), m_offset(offset) { if (!end) { std::fill(m_index_keeper.begin(), m_index_keeper.end(), 0); - auto func = [](const auto& s) noexcept - { - return xt::value(s, 0); - }; + auto func = [](const auto& s) noexcept { return xt::value(s, 0); }; for (size_type i = 0; i < sizeof...(S); ++i) { if (!is_newaxis_slice(i)) @@ -2061,18 +2017,14 @@ namespace xt template inline void xview_stepper::step(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_step_forward(dim, func); } template inline void xview_stepper::step_back(size_type dim) { - auto func = [this](size_type index, size_type offset) - { + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_step_backward(dim, func); @@ -2081,18 +2033,14 @@ namespace xt template inline void xview_stepper::step(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_step_forward(dim, n, func); } template inline void 
xview_stepper::step_back(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) - { + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_step_backward(dim, n, func); @@ -2101,20 +2049,14 @@ namespace xt template inline void xview_stepper::reset(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step_back(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; common_reset(dim, func, false); } template inline void xview_stepper::reset_back(size_type dim) { - auto func = [this](size_type index, size_type offset) - { - m_it.step(index, offset); - }; + auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; common_reset(dim, func, true); } @@ -2142,12 +2084,10 @@ namespace xt template inline void xview_stepper::to_end_impl(layout_type l) { - auto func = [](const auto& s) noexcept - { + auto func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1); }; - auto size_func = [](const auto& s) noexcept - { + auto size_func = [](const auto& s) noexcept { return get_size(s); }; @@ -2197,15 +2137,14 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept - { + auto func = [&dim, this](const auto& s) noexcept { return step_size(s, this->m_index_keeper[dim]++, 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : 1; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2218,8 +2157,7 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept - { + auto func = [&dim, &n, this](const auto& s) noexcept { auto st_size = step_size(s, this->m_index_keeper[dim], n); this->m_index_keeper[dim] += n; return size_type(st_size); @@ -2228,8 +2166,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : n; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2242,16 +2180,15 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept - { + auto func = [&dim, this](const auto& s) noexcept { this->m_index_keeper[dim]--; return step_size(s, this->m_index_keeper[dim], 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : 1; + size_type step_size = index < sizeof...(S) ? + apply(index, func, p_view->slices()) : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2264,8 +2201,7 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept - { + auto func = [&dim, &n, this](const auto& s) noexcept { this->m_index_keeper[dim] -= n; return step_size(s, this->m_index_keeper[dim], n); }; @@ -2273,8 +2209,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) - : n; + size_type step_size = index < sizeof...(S) ? 
+ apply(index, func, p_view->slices()) : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2285,27 +2221,19 @@ namespace xt template void xview_stepper::common_reset(size_type dim, F f, bool backwards) { - auto size_func = [](const auto& s) noexcept - { - return get_size(s); - }; - auto end_func = [](const auto& s) noexcept - { - return xt::value(s, get_size(s) - 1) - xt::value(s, 0); - }; + auto size_func = [](const auto& s) noexcept { return get_size(s); }; + auto end_func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1) - xt::value(s, 0); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { if (dim < m_index_keeper.size()) { - size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) - : p_view->shape()[dim]; + size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) : p_view->shape()[dim]; m_index_keeper[dim] = backwards ? size - 1 : 0; } - size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) - : p_view->shape()[dim] - 1; + size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) : p_view->shape()[dim] - 1; index -= newaxis_count_before(index); f(index, reset_n); } From 643b6016dca9019525ad6178305d89f004507d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 15:54:38 +0100 Subject: [PATCH 03/33] Fix openMP parallel linear assign --- include/xtensor/xassign.hpp | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 7c67da273..5ce72a6e5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -657,22 +657,22 @@ namespace xt } } #elif defined(XTENSOR_USE_OPENMP) - if (size >= XTENSOR_OPENMP_TRESHOLD) + if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) { - #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) - #ifndef _WIN32 +#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) +#ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } - #else +#else for(auto i = static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } - #endif +#endif } else { @@ -719,10 +719,21 @@ namespace xt *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + if (n >= XTENSOR_OPENMP_TRESHOLD) { - *(dst + i) = static_cast(*(src + i)); +#pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + { + *(dst + i) = static_cast(*(src + i)); + } + } + else { + for (; n > size_type(0); --n) + { + *dst = static_cast(*src); + ++src; + ++dst; + } } #else for (; n > size_type(0); --n) @@ -1022,7 +1033,7 @@ namespace xt } // std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + if (outer_loop_size>20) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = 0; ox < outer_loop_size; ++ox) From 8e75d92156308c7b67cc586c362dc481b79dc239 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 18:02:08 +0100 Subject: [PATCH 04/33] Fix typo on xassign.hpp --- include/xtensor/xassign.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5ce72a6e5..dd8d79713 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -657,7 +657,7 @@ namespace xt } } #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) + if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) #ifndef _WIN32 From 63381a8dba4678758f79b8b354db6aac30059617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 4 Feb 2021 13:09:55 +0100 Subject: [PATCH 05/33] initial thoughts parallel strided assign --- include/xtensor/xassign.hpp | 316 +++++++++++++++++++++++------------- 1 file changed, 205 insertions(+), 111 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index dd8d79713..fbf2dfcfd 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -247,7 +246,7 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); + (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); } template @@ -343,7 +342,7 @@ namespace xt template static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } - static constexpr bool simd_interface() { return has_simd_interface() && + static constexpr bool simd_interface() { return has_simd_interface() && has_simd_interface(); } public: @@ -395,7 +394,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign && de1.layout() == de2.layout()) + else if (simd_strided_assign) { strided_loop_assigner::run(de1, de2); } @@ -419,20 +418,16 @@ namespace xt inline void xexpression_assigner::computed_assign(xexpression& e1, const xexpression& e2) { using shape_type = typename E1::shape_type; - using comperator_type = std::greater; - using size_type = typename E1::size_type; E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); - size_type dim2 = de2.dimension(); - shape_type shape = uninitialized_shape(dim2); - + size_type dim = de2.dimension(); + shape_type shape = uninitialized_shape(dim); bool trivial_broadcast = de2.broadcast_shape(shape, true); - auto && de1_shape = de1.shape(); - if (dim2 > de1.dimension() || std::lexicographical_compare(shape.begin(), shape.end(), de1_shape.begin(), de1_shape.end(), comperator_type())) + if (dim > de1.dimension() || shape > de1.shape()) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -620,7 +615,6 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { -// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -642,37 +636,27 @@ namespace xt } #if defined(XTENSOR_USE_TBB) - if (size >= XTENSOR_TBB_THRESHOLD) - { - tbb::parallel_for(align_begin, align_end, simd_size, [&e1, 
&e2](size_t i) - { - e1.template store_simd(i, e2.template load_simd(i)); - }); - } - else + tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) { - for (size_type i = align_begin; i < align_end; i += simd_size) - { - e1.template store_simd(i, e2.template load_simd(i)); - } - } + e1.template store_simd(i, e2.template load_simd(i)); + }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) + if (size >= XTENSOR_OPENMP_TRESHOLD) { -#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) -#ifndef _WIN32 + #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) + #ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } -#else + #else for(auto i = static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } -#endif + #endif } else { @@ -712,28 +696,17 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); -// std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (n >= XTENSOR_OPENMP_TRESHOLD) + #pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) { -#pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) - { - *(dst + i) = static_cast(*(src + i)); - } - } - else { - for (; n > size_type(0); --n) - { - *dst = static_cast(*src); - ++src; - ++dst; - } + *(dst + i) = static_cast(*(src + i)); } #else for (; n > size_type(0); --n) @@ -783,29 +756,75 @@ namespace xt } } + template + static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) + { + auto i = outer_index.size()-1; + auto stride_sizes = full_like(outer_shape,0); + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i >= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] !=0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + if (i>0) { + outer_index[i-1]++; + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + else { + if (i>0) { + // compute stride sizes on the fly ... + stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; + } + } + } + i++; + // add remaining iterations + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + template - static void nth_idx(size_t n, T& outer_index, const T& outer_shape) + static void nth_idx(size_t n, T& outer_index, T& outer_shape) { + // assume outer_index is filled with 0 + auto stride_sizes = full_like(outer_shape,0); - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); + auto i = outer_index.size()-1; + stride_sizes[i] = 1; + i-- // compute strides - using size_type = typename T::size_type; - for ( size_type i = outer_shape.size(); i>0; i--) { - stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; - } + for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } // compute index - for (size_type i=0; i < outer_shape.size(); i++) + for (i=0; i < outer_index.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] = d_idx; + outer_index[i] += d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen - outer_index[i] = 0; } } } @@ -835,27 +854,80 @@ namespace xt } } + template + static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) + { + using size_type = typename T::size_type; + size_type i = 0; + auto stride_sizes = outer_shape; + stride_sizes[i] = 1; + // fill up dimensions in the outer_idx as required + for (; i < outer_index.size(); i++) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i])) { + outer_index[i] += d_idx; + n-=d_idx * stride_sizes[i]; + // we filled up all dimensions required + break; + } + else if (outer_index[i] != 0) { + outer_index[i] = 0; + n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; + + if (i+1= 0; i--) + { + auto d_idx = n/stride_sizes[i]; + if (d_idx<(outer_shape[i]-outer_index[i]-1)) { + outer_index[i] += d_idx; + n -= d_idx * stride_sizes[i]; + } + else { + // that may not happen + } + } + } + template - static void nth_idx(size_t n, T& outer_index, const T& outer_shape) + static void nth_idx(size_t n, T& outer_index, T& outer_shape) { - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - + // assume outer_index is filled with 0 using size_type = typename T::size_type; + auto stride_sizes = outer_shape; + size_type i = 0; + stride_sizes[i] = 1; + i++; // compute required strides - for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 
1 : stride_sizes[i-1] * outer_shape[i-1]; } + for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } // compute index - for (size_type i = outer_shape.size(); i>0;) - { - i--; + i = outer_shape.size()-1; + for (; i >= 0; i--) + { auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] = d_idx; + outer_index[i] += d_idx; n -= d_idx * stride_sizes[i]; } -// else {} // that may not happen + else { + // that may not happen + } } } }; @@ -924,13 +996,11 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - if (cut(e1.strides()); cut = csf(e2); - if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; @@ -950,6 +1020,7 @@ namespace xt } } +#define strided_row_major_fallback #define strided_parallel_assign template template @@ -957,10 +1028,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; + if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); - switch (dynamic_layout) { case layout_type::row_major: @@ -970,7 +1041,11 @@ namespace xt is_row_major = false; break; default: +#ifdef strided_row_major_fallback + is_row_major = true; +#else return fallback_assigner(e1, e2).run(); +#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1031,55 +1106,75 @@ namespace xt { step_dim = cut; } -// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <20) { - std::size_t first_step = true; + static bool reported = false; + if (!reported) { + std::cout << "SIMD size is " << simd_type::size << " step_dim " << step_dim << std::endl; + reported = true; + } + std::cout << "+"; + std::size_t first_step = true; +// #define old_parallel #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? 
+ strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + else { + // next unaligned index + if (is_row_major) { + auto i = max_shape.size()-1; +// fct_stepper.step_back(i+1,inner_loop_size); +// res_stepper.step_back(i+1,inner_loop_size); + fct_stepper.reset(i+1); + res_stepper.reset(i+1); + fct_stepper.step_back(i+1,1); + res_stepper.step_back(i+1,1); - for (std::size_t i = 0; i < idx.size(); ++i) + + for (; i >= 0; --i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + if (idx[i] + 1 >= max_shape[i]) + { + idx[i] = 0; + fct_stepper.reset(i); + res_stepper.reset(i); + } + else + { + idx[i]++; + fct_stepper.step(i, 1); + res_stepper.step(i, 1); + break; + } } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.template store_simd(fct_stepper.template step_simd()); } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + else { + std::cout << "oops" << std::endl; } +// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + } +#else for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1117,8 +1212,7 @@ namespace xt } } } -#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - } + #endif } From 01e156e94ede732a5f43229991b24d379eb1c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Thu, 27 May 2021 23:30:24 +0200 Subject: [PATCH 06/33] trigger strided assign properly --- include/xtensor/xassign.hpp | 260 ++++++++++-------------------------- 1 file changed, 70 insertions(+), 190 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index fbf2dfcfd..99797bcb7 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -246,7 +246,7 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides())); + (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -394,7 +394,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) + else if (simd_strided_assign && de1.layout() == de2.layout()) { strided_loop_assigner::run(de1, de2); } @@ -615,6 +615,7 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { +// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = 
std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -696,7 +697,7 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); - +// std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { @@ -756,75 +757,29 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T&outer_index, T& outer_shape) - { - auto i = outer_index.size()-1; - auto stride_sizes = full_like(outer_shape,0); - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i >= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] !=0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - if (i>0) { - outer_index[i-1]++; - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - else { - if (i>0) { - // compute stride sizes on the fly ... - stride_sizes[i-1] = stride_sizes[i] * outer_shape[i]; - } - } - } - i++; - // add remaining iterations - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 - auto stride_sizes = full_like(outer_shape,0); - auto i = outer_index.size()-1; - stride_sizes[i] = 1; - i-- + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); // compute strides - for (; i >= 0; i--) { stride_sizes[i] = stride_sizes[i+1] * outer_shape[i+1]; } + using size_type = typename T::size_type; + for ( size_type i = outer_shape.size(); i>0; i--) { + stride_sizes[i-1] = (i==outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } // compute index - for (i=0; i < outer_index.size(); i++) + for (size_type i=0; i < outer_shape.size(); i++) { auto d_idx = n/stride_sizes[i]; if (d_idx < outer_shape[i]) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } else { // that may not happen + outer_index[i] = 0; } } } @@ -854,80 +809,27 @@ namespace xt } } - template - static void nth_next_idx(size_t n, T& outer_index, T& outer_shape) - { - using size_type = typename T::size_type; - size_type i = 0; - auto stride_sizes = outer_shape; - stride_sizes[i] = 1; - // fill up dimensions in the outer_idx as required - for (; i < outer_index.size(); i++) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i])) { - outer_index[i] += d_idx; - n-=d_idx * stride_sizes[i]; - // we filled up all dimensions required - break; - } - else if (outer_index[i] != 0) { - outer_index[i] = 0; - n-=(outer_shape[i]-outer_index[i]-1) * stride_sizes[i]; - - if (i+1= 0; i--) - { - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-outer_index[i]-1)) { - outer_index[i] += d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - } - } - } - template - static void nth_idx(size_t n, T& outer_index, T& outer_shape) + static void nth_idx(size_t n, T& outer_index, const T& outer_shape) { - // assume outer_index is filled with 0 + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + using size_type = typename T::size_type; - auto stride_sizes = outer_shape; - size_type i = 0; - stride_sizes[i] = 1; - i++; // compute required strides - for (; i < outer_shape.size(); i++) { stride_sizes[i] = stride_sizes[i-1] * outer_shape[i-1]; } + for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } // compute index - i = outer_shape.size()-1; - for (; i >= 0; i--) - { + for (size_type i = outer_shape.size(); i>0;) + { + i--; auto d_idx = n/stride_sizes[i]; if (d_idx<(outer_shape[i]-1)) { - outer_index[i] += d_idx; + outer_index[i] = d_idx; n -= d_idx * stride_sizes[i]; } - else { - // that may not happen - } +// else {} // that may not happen } } }; @@ -996,11 +898,13 @@ namespace xt { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + if (cut(e1.strides()); cut = csf(e2); + if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; @@ -1020,7 +924,6 @@ namespace xt } } -#define strided_row_major_fallback #define strided_parallel_assign template template @@ -1028,10 +931,10 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) { layout_type dynamic_layout = e1.layout(); + switch (dynamic_layout) { case layout_type::row_major: @@ -1041,11 +944,7 @@ namespace xt is_row_major = false; break; default: -#ifdef strided_row_major_fallback - is_row_major = true; -#else return fallback_assigner(e1, e2).run(); -#endif } } else if (E1::static_layout == layout_type::row_major) @@ -1106,75 +1005,55 @@ namespace xt { step_dim = cut; } +// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? 
- strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_next_idx(ox,idx, max_shape); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - else { - // next unaligned index - if (is_row_major) { - auto i = max_shape.size()-1; -// fct_stepper.step_back(i+1,inner_loop_size); -// res_stepper.step_back(i+1,inner_loop_size); - fct_stepper.reset(i+1); - res_stepper.reset(i+1); - fct_stepper.step_back(i+1,1); - res_stepper.step_back(i+1,1); + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (; i >= 0; --i) + for (std::size_t i = 0; i < idx.size(); ++i) { - if (idx[i] + 1 >= max_shape[i]) - { - idx[i] = 0; - fct_stepper.reset(i); - res_stepper.reset(i); - } - else - { - idx[i]++; - fct_stepper.step(i, 1); - res_stepper.step(i, 1); - break; - } + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } + first_step = false; } - else { - std::cout << "oops" << std::endl; + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); } -// std:: cout << "Next index is " << idx[0] << "," << idx[1] << ","<(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); } - } -#else + } + else { + +#endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { for (std::size_t i = 0; i < simd_size; ++i) @@ -1212,7 +1091,8 @@ namespace xt } } } - +#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) + } #endif } From da29eda99c086a83dbedba35a7d10ac4011137ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 15:54:38 +0100 Subject: [PATCH 07/33] Fix openMP parallel linear assign --- include/xtensor/xassign.hpp | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 99797bcb7..3e1f0af9e 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -642,22 +642,22 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= XTENSOR_OPENMP_TRESHOLD) + if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) { - #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) - #ifndef _WIN32 +#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) +#ifndef _WIN32 for (size_type i = align_begin; i < align_end; i += simd_size) { e1.template store_simd(i, e2.template load_simd(i)); } - #else +#else for(auto i = 
static_cast(align_begin); i < static_cast(align_end); i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); } - #endif +#endif } else { @@ -704,10 +704,21 @@ namespace xt *(dst + i) = static_cast(*(src + i)); }); #elif defined(XTENSOR_USE_OPENMP) - #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + if (n >= XTENSOR_OPENMP_TRESHOLD) { - *(dst + i) = static_cast(*(src + i)); +#pragma omp parallel for default(none) shared(src, dst, n) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + { + *(dst + i) = static_cast(*(src + i)); + } + } + else { + for (; n > size_type(0); --n) + { + *dst = static_cast(*src); + ++src; + ++dst; + } } #else for (; n > size_type(0); --n) @@ -1007,7 +1018,7 @@ namespace xt } // std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <50) { + if (outer_loop_size>20) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = 0; ox < outer_loop_size; ++ox) From 1531f81d5ba318f96367c9e7b7d8862ca80536c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Starru=C3=9F?= Date: Wed, 30 Nov 2022 18:02:08 +0100 Subject: [PATCH 08/33] Fix typo on xassign.hpp --- include/xtensor/xassign.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 3e1f0af9e..87dc9e3c5 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -642,7 +642,7 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); }); #elif defined(XTENSOR_USE_OPENMP) - if (size >= size_typeXTENSOR_OPENMP_TRESHOLD)) + if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2) #ifndef _WIN32 From 198b3852c7289124a35f2cd91ef2df345d6e330d Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 02:55:20 +0100 Subject: [PATCH 09/33] Fix wrong merge --- include/xtensor/xview.hpp | 624 ++++++++++++++++++++++---------------- 1 file changed, 356 insertions(+), 268 deletions(-) diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 931bb67a9..52b7c33c0 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -1,11 +1,11 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * -* Copyright (c) QuantStack * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XTENSOR_VIEW_HPP #define XTENSOR_VIEW_HPP @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include "xaccessible.hpp" @@ -34,7 +34,6 @@ #include "xtensor_forward.hpp" #include "xview_utils.hpp" - namespace xt { @@ -76,26 +75,22 @@ namespace xt { template - struct is_xrange - : std::false_type + struct is_xrange : std::false_type { }; template - struct is_xrange> - : std::true_type + struct is_xrange> : std::true_type { }; template - struct is_xall_slice - : std::false_type + struct is_xall_slice : std::false_type { }; template - struct is_xall_slice> - : std::true_type + struct is_xall_slice> : std::true_type { }; @@ -134,32 +129,33 @@ namespace xt template struct is_xscalar_impl> { - static constexpr bool value = static_cast(integral_count()) == static_dimension::shape_type>::value ? true : false; + static constexpr bool value = static_cast(integral_count() + ) == static_dimension::shape_type>::value + ? true + : false; }; template - struct is_strided_slice_impl - : std::true_type + struct is_strided_slice_impl : std::true_type { }; template - struct is_strided_slice_impl> - : std::false_type + struct is_strided_slice_impl> : std::false_type { }; template - struct is_strided_slice_impl> - : std::false_type + struct is_strided_slice_impl> : std::false_type { }; // If we have no discontiguous slices, we can calculate strides for this view. template struct is_strided_view - : std::integral_constant, - is_strided_slice_impl>...>::value> + : std::integral_constant< + bool, + xtl::conjunction, is_strided_slice_impl>...>::value> { }; @@ -183,7 +179,10 @@ namespace xt static constexpr bool have_all_seen = all_seen || is_all_slice; static constexpr bool have_range_seen = is_range_slice; - static constexpr bool is_valid = valid && (have_all_seen ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); + static constexpr bool is_valid = valid + && (have_all_seen + ? is_all_slice + : (!range_seen && (is_int_slice || is_range_slice))); static constexpr bool value = is_contiguous_view_impl>::value; + static constexpr bool is_valid = valid + && (have_int_seen + ? is_int_slice + : (!range_seen && (is_all_slice || is_range_slice))); + static constexpr bool value = is_contiguous_view_impl < layout_type::column_major, is_valid, + have_int_seen, is_range_slice || range_seen, + xtl::mpl::pop_front_t < V >> ::value; }; template @@ -234,11 +234,14 @@ namespace xt // TODO relax has_data_interface constraint here! 
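// --- Standalone sketch, not part of the patch above: the recursion in
// is_contiguous_view_impl boils down to the rule that, for a row-major
// expression, a view stays contiguous when its slices match the pattern
//     integral* range? all*
// i.e. leading integral indices, at most one contiguous range, then only
// full ("all") slices. The names below (slice_kind, row_major_contiguous)
// are illustrative and do not exist in xtensor.

#include <cstddef>
#include <initializer_list>

enum class slice_kind { integral, range, all };

constexpr bool row_major_contiguous(std::initializer_list<slice_kind> slices)
{
    bool all_seen = false;
    bool range_seen = false;
    for (slice_kind s : slices)
    {
        if (all_seen)
        {
            if (s != slice_kind::all)
            {
                return false;   // after the first full slice, only full slices may follow
            }
        }
        else if (s == slice_kind::all)
        {
            all_seen = true;
        }
        else if (range_seen)
        {
            return false;       // a range may only be followed by full slices
        }
        else if (s == slice_kind::range)
        {
            range_seen = true;
        }
        // integral slices before any range or full slice keep the view contiguous
    }
    return true;
}

// view(e, 1, range(2, 5), all()) stays contiguous on a row-major expression:
static_assert(row_major_contiguous({slice_kind::integral, slice_kind::range, slice_kind::all}), "");
// view(e, all(), 3) does not:
static_assert(!row_major_contiguous({slice_kind::all, slice_kind::integral}), "");
// --- end of sketch ---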
template struct is_contiguous_view - : std::integral_constant::value && - !(E::static_layout == layout_type::column_major && static_cast(static_dimension::value) != sizeof...(S)) && - is_contiguous_view_impl>::value - > + : std::integral_constant< + bool, + has_data_interface::value + && !( + E::static_layout == layout_type::column_major + && static_cast(static_dimension::value) != sizeof...(S) + ) + && is_contiguous_view_impl>::value> { }; @@ -276,21 +279,20 @@ namespace xt struct get_contigous_shape_type { // if we have no `range` in the slices we can re-use the shape with an offset - using type = std::conditional_t...>::value, - typename xview_shape_type::type, - // In the false branch we know that we have only integers at the front OR end, and NO range - typename unwrap_offset_container()>::type>; + using type = std::conditional_t< + xtl::disjunction...>::value, + typename xview_shape_type::type, + // In the false branch we know that we have only integers at the front OR end, and NO range + typename unwrap_offset_container()>::type>; }; template - struct is_sequence_view - : std::integral_constant + struct is_sequence_view : std::integral_constant { }; template - struct is_sequence_view> - : std::integral_constant + struct is_sequence_view> : std::integral_constant { }; } @@ -304,15 +306,16 @@ namespace xt using size_type = typename xexpression_type::size_type; using temporary_type = view_temporary_type_t; - static constexpr layout_type layout = - detail::is_contiguous_view::value ? - xexpression_type::static_layout : layout_type::dynamic; + static constexpr layout_type layout = detail::is_contiguous_view::value + ? xexpression_type::static_layout + : layout_type::dynamic; static constexpr bool is_const = std::is_const>::value; - using extract_storage_type = xtl::mpl::eval_if_t, - detail::expr_storage_type, - make_invalid_type<>>; + using extract_storage_type = xtl::mpl::eval_if_t< + has_data_interface, + detail::expr_storage_type, + make_invalid_type<>>; using storage_type = std::conditional_t; }; @@ -324,17 +327,20 @@ namespace xt static constexpr bool is_strided_view = detail::is_strided_view::value; static constexpr bool is_contiguous_view = detail::is_contiguous_view::value; - using inner_shape_type = std::conditional_t::type, - typename xview_shape_type::type>; + using inner_shape_type = std::conditional_t< + is_contiguous_view, + typename detail::get_contigous_shape_type::type, + typename xview_shape_type::type>; - using stepper = std::conditional_t>, - xview_stepper>::value, CT, S...>>; + using stepper = std::conditional_t< + is_strided_view, + xstepper>, + xview_stepper>::value, CT, S...>>; - using const_stepper = std::conditional_t>, - xview_stepper, S...>>; + using const_stepper = std::conditional_t< + is_strided_view, + xstepper>, + xview_stepper, S...>>; }; /** @@ -354,9 +360,9 @@ namespace xt template class xview : public xview_semantic>, public std::conditional_t< - detail::is_contiguous_view, S...>::value, - xcontiguous_iterable>, - xiterable>>, + detail::is_contiguous_view, S...>::value, + xcontiguous_iterable>, + xiterable>>, public xaccessible>, public extension::xview_base_t { @@ -378,9 +384,8 @@ namespace xt using bool_load_type = typename xexpression_type::bool_load_type; using reference = typename inner_types::reference; using const_reference = typename inner_types::const_reference; - using pointer = std::conditional_t; + using pointer = std:: + conditional_t; using const_pointer = typename xexpression_type::const_pointer; using size_type = typename 
inner_types::size_type; using difference_type = typename xexpression_type::difference_type; @@ -395,28 +400,35 @@ namespace xt using inner_shape_type = typename iterable_base::inner_shape_type; using shape_type = typename xview_shape_type::type; - using xexpression_inner_strides_type = xtl::mpl::eval_if_t, - detail::expr_inner_strides_type, - get_strides_type>; + using xexpression_inner_strides_type = xtl::mpl::eval_if_t< + has_strides, + detail::expr_inner_strides_type, + get_strides_type>; - using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t, - detail::expr_inner_backstrides_type, - get_strides_type>; + using xexpression_inner_backstrides_type = xtl::mpl::eval_if_t< + has_strides, + detail::expr_inner_backstrides_type, + get_strides_type>; using storage_type = typename inner_types::storage_type; - static constexpr bool has_trivial_strides = is_contiguous_view && !xtl::disjunction...>::value; - using inner_strides_type = std::conditional_t()>::type, - get_strides_t>; - - using inner_backstrides_type = std::conditional_t()>::type, - get_strides_t>; + static constexpr bool has_trivial_strides = is_contiguous_view + && !xtl::disjunction...>::value; + using inner_strides_type = std::conditional_t< + has_trivial_strides, + typename detail::unwrap_offset_container< + xexpression_type::static_layout, + xexpression_inner_strides_type, + integral_count()>::type, + get_strides_t>; + + using inner_backstrides_type = std::conditional_t< + has_trivial_strides, + typename detail::unwrap_offset_container< + xexpression_type::static_layout, + xexpression_inner_backstrides_type, + integral_count()>::type, + get_strides_t>; using strides_type = get_strides_t; using backstrides_type = strides_type; @@ -427,21 +439,21 @@ namespace xt using stepper = typename iterable_base::stepper; using const_stepper = typename iterable_base::const_stepper; - using linear_iterator = std::conditional_t::value && is_strided_view, - std::conditional_t, - typename iterable_base::linear_iterator>; - using const_linear_iterator = std::conditional_t::value && is_strided_view, - typename xexpression_type::const_linear_iterator, - typename iterable_base::const_linear_iterator>; + using linear_iterator = std::conditional_t< + has_data_interface::value && is_strided_view, + std::conditional_t, + typename iterable_base::linear_iterator>; + using const_linear_iterator = std::conditional_t< + has_data_interface::value && is_strided_view, + typename xexpression_type::const_linear_iterator, + typename iterable_base::const_linear_iterator>; using reverse_linear_iterator = std::reverse_iterator; using const_reverse_linear_iterator = std::reverse_iterator; using container_iterator = pointer; using const_container_iterator = const_pointer; - constexpr static std::size_t rank = SIZE_MAX; + static constexpr std::size_t rank = SIZE_MAX; // The FSL argument prevents the compiler from calling this constructor // instead of the copy constructor when sizeof...(SL) == 0. 
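// --- Standalone illustration, not taken from the patch: in the
// has_trivial_strides case above (contiguous view, no range slices), the
// parent expression's strides can be reused unchanged and only a constant
// data offset differs. leading_offset below is a hypothetical helper used
// for this example only; it forms the dot product of the dropped leading
// indices with the matching parent strides.

#include <cstddef>
#include <numeric>
#include <vector>

inline std::ptrdiff_t leading_offset(const std::vector<std::ptrdiff_t>& parent_strides,
                                     const std::vector<std::ptrdiff_t>& leading_indices)
{
    // offset of view(e, i0, ..., ik, all(), ...) relative to e.data()
    return std::inner_product(leading_indices.begin(), leading_indices.end(),
                              parent_strides.begin(), std::ptrdiff_t(0));
}

// Example: a 4x5x6 row-major tensor has strides {30, 6, 1};
// view(e, 2, 3) starts at offset 2*30 + 3*6 = 78 and keeps stride {1} for its
// remaining axis.
// --- end of sketch ---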
@@ -490,48 +502,36 @@ namespace xt bool has_linear_assign(const ST& strides) const; template - std::enable_if_t - stepper_begin(const ST& shape); + std::enable_if_t stepper_begin(const ST& shape); template - std::enable_if_t - stepper_end(const ST& shape, layout_type l); + std::enable_if_t stepper_end(const ST& shape, layout_type l); template - std::enable_if_t - stepper_begin(const ST& shape) const; + std::enable_if_t stepper_begin(const ST& shape) const; template - std::enable_if_t - stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t - stepper_begin(const ST& shape); + std::enable_if_t stepper_begin(const ST& shape); template - std::enable_if_t - stepper_end(const ST& shape, layout_type l); + std::enable_if_t stepper_end(const ST& shape, layout_type l); template - std::enable_if_t - stepper_begin(const ST& shape) const; + std::enable_if_t stepper_begin(const ST& shape) const; template - std::enable_if_t - stepper_end(const ST& shape, layout_type l) const; + std::enable_if_t stepper_end(const ST& shape, layout_type l) const; template - std::enable_if_t::value, storage_type&> - storage(); + std::enable_if_t::value, storage_type&> storage(); template - std::enable_if_t::value, const storage_type&> - storage() const; + std::enable_if_t::value, const storage_type&> storage() const; template - std::enable_if_t::value && is_strided_view, linear_iterator> - linear_begin(); + std::enable_if_t::value && is_strided_view, linear_iterator> linear_begin(); template - std::enable_if_t::value && is_strided_view, linear_iterator> - linear_end(); + std::enable_if_t::value && is_strided_view, linear_iterator> linear_end(); template std::enable_if_t::value && is_strided_view, const_linear_iterator> @@ -582,12 +582,10 @@ namespace xt backstrides() const; template - std::enable_if_t::value && is_strided_view, const_pointer> - data() const; + std::enable_if_t::value && is_strided_view, const_pointer> data() const; template - std::enable_if_t::value && is_strided_view, pointer> - data(); + std::enable_if_t::value && is_strided_view, pointer> data(); template std::enable_if_t::value && is_strided_view, std::size_t> @@ -620,11 +618,13 @@ namespace xt size_type underlying_size(size_type dim) const; xtl::xclosure_pointer operator&() &; - xtl::xclosure_pointer operator&() const &; + xtl::xclosure_pointer operator&() const&; xtl::xclosure_pointer operator&() &&; - template ::value && is_contiguous_view, int>> + template < + class E, + class T = xexpression_type, + class = std::enable_if_t::value && is_contiguous_view, int>> void assign_to(xexpression& e, bool force_resize) const; template @@ -646,9 +646,11 @@ namespace xt template enable_simd_interface store_simd(size_type i, const simd& e); - template ::size, - class T = xexpression_type> + template < + class align, + class requested_type = value_type, + std::size_t N = xt_simd::simd_traits::size, + class T = xexpression_type> enable_simd_interface> load_simd(size_type i) const; template @@ -789,9 +791,7 @@ namespace xt { public: - using view_type = std::conditional_t, - xview>; + using view_type = std::conditional_t, xview>; using substepper_type = get_stepper; using value_type = typename substepper_type::value_type; @@ -803,8 +803,13 @@ namespace xt using shape_type = typename substepper_type::shape_type; xview_stepper() = default; - xview_stepper(view_type* view, substepper_type it, - size_type offset, bool end = false, layout_type l = XTENSOR_DEFAULT_TRAVERSAL); + 
xview_stepper( + view_type* view, + substepper_type it, + size_type offset, + bool end = false, + layout_type l = XTENSOR_DEFAULT_TRAVERSAL + ); reference operator*() const; @@ -887,7 +892,7 @@ namespace xt std::forward(e), std::forward(first_slice), std::forward(slices)... - ) + ) { } @@ -895,25 +900,26 @@ namespace xt template template xview::xview(std::true_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)), - m_slices(std::forward(first_slice), std::forward(slices)...), - m_shape(compute_shape(detail::is_sequence_view{})), - m_strides(m_e.strides()), - m_backstrides(m_e.backstrides()), - m_data_offset(data_offset_impl(std::make_index_sequence())), - m_strides_computed(true) + : m_e(std::forward(e)) + , m_slices(std::forward(first_slice), std::forward(slices)...) + , m_shape(compute_shape(detail::is_sequence_view{})) + , m_strides(m_e.strides()) + , m_backstrides(m_e.backstrides()) + , m_data_offset(data_offset_impl(std::make_index_sequence())) + , m_strides_computed(true) { } template template xview::xview(std::false_type, CTA&& e, FSL&& first_slice, SL&&... slices) noexcept - : m_e(std::forward(e)), - m_slices(std::forward(first_slice), std::forward(slices)...), - m_shape(compute_shape(std::false_type{})), - m_strides_computed(false) + : m_e(std::forward(e)) + , m_slices(std::forward(first_slice), std::forward(slices)...) + , m_shape(compute_shape(std::false_type{})) + , m_strides_computed(false) { } + //@} template @@ -936,6 +942,7 @@ namespace xt { return semantic_base::operator=(e); } + //@} template @@ -974,13 +981,8 @@ namespace xt template inline layout_type xview::layout() const noexcept { - return xtl::mpl::static_if([&](auto self) - { - if (static_layout != layout_type::dynamic) - { - return static_layout; - } - else + return xtl::mpl::static_if( + [&](auto self) { return self(this)->m_e.layout()==layout_type::row_major ? ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : @@ -988,8 +990,7 @@ namespace xt bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; - } - }, + }, /* else */ [&](auto /*self*/) { std::cout << "View is not strided!" 
<< std::endl; @@ -1035,13 +1036,17 @@ namespace xt template inline void xview::fill(const T& value) { - xtl::mpl::static_if([&](auto self) - { - std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); - }, /*else*/ [&](auto self) - { - std::fill(self(this)->begin(), self(this)->end(), value); - }); + xtl::mpl::static_if( + [&](auto self) + { + std::fill(self(this)->linear_begin(), self(this)->linear_end(), value); + }, + /*else*/ + [&](auto self) + { + std::fill(self(this)->begin(), self(this)->end(), value); + } + ); } /** @@ -1087,7 +1092,6 @@ namespace xt return unchecked_impl(make_index_sequence(args...), static_cast(args)...); } - template template inline auto xview::element(It first, It last) -> reference @@ -1175,16 +1179,15 @@ namespace xt */ template template - inline auto xview::storage() -> - std::enable_if_t::value, storage_type&> + inline auto xview::storage() -> std::enable_if_t::value, storage_type&> { return m_e.storage(); } template template - inline auto xview::storage() const -> - std::enable_if_t::value, const storage_type&> + inline auto xview::storage() const + -> std::enable_if_t::value, const storage_type&> { return m_e.storage(); } @@ -1290,8 +1293,8 @@ namespace xt */ template template - inline auto xview::strides() const -> - std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::strides() const + -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1303,8 +1306,8 @@ namespace xt template template - inline auto xview::backstrides() const -> - std::enable_if_t::value && is_strided_view, const inner_strides_type&> + inline auto xview::backstrides() const + -> std::enable_if_t::value && is_strided_view, const inner_strides_type&> { if (!m_strides_computed) { @@ -1319,16 +1322,16 @@ namespace xt */ template template - inline auto xview::data() const -> - std::enable_if_t::value && is_strided_view, const_pointer> + inline auto xview::data() const + -> std::enable_if_t::value && is_strided_view, const_pointer> { return m_e.data(); } template template - inline auto xview::data() -> - std::enable_if_t::value && is_strided_view, pointer> + inline auto xview::data() + -> std::enable_if_t::value && is_strided_view, pointer> { return m_e.data(); } @@ -1337,9 +1340,9 @@ namespace xt template inline std::size_t xview::data_offset_impl(std::index_sequence) const noexcept { - auto temp = std::array({ - (static_cast(xt::value(std::get(m_slices), 0)))... 
- }); + auto temp = std::array( + {(static_cast(xt::value(std::get(m_slices), 0)))...} + ); std::ptrdiff_t result = 0; std::size_t i = 0; @@ -1359,8 +1362,8 @@ namespace xt */ template template - inline auto xview::data_offset() const noexcept -> - std::enable_if_t::value && is_strided_view, std::size_t> + inline auto xview::data_offset() const noexcept + -> std::enable_if_t::value && is_strided_view, std::size_t> { if (!m_strides_computed) { @@ -1369,6 +1372,7 @@ namespace xt } return m_data_offset; } + //@} template @@ -1384,7 +1388,7 @@ namespace xt } template - inline auto xview::operator&() const & -> xtl::xclosure_pointer + inline auto xview::operator&() const& -> xtl::xclosure_pointer { return xtl::closure_pointer(*this); } @@ -1421,15 +1425,20 @@ namespace xt template inline bool xview::has_linear_assign(const ST& str) const { - return xtl::mpl::static_if([&](auto self) - { - return str.size() == self(this)->strides().size() && - std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); - - }, /*else*/ [](auto /*self*/){ - return false; - }); + return xtl::mpl::static_if( + [&](auto self) + { + return str.size() == self(this)->strides().size() + && std::equal(str.cbegin(), str.cend(), self(this)->strides().begin()); + }, + /*else*/ + [](auto /*self*/) + { + return false; + } + ); } + //@} template @@ -1465,7 +1474,8 @@ namespace xt } template - inline auto xview::data_xend(layout_type l, size_type offset) const noexcept -> const_container_iterator + inline auto xview::data_xend(layout_type l, size_type offset) const noexcept + -> const_container_iterator { return data_xend_impl(data() + data_offset(), l, offset); } @@ -1503,7 +1513,8 @@ namespace xt template template - inline auto xview::load_simd(size_type i) const -> enable_simd_interface> + inline auto xview::load_simd(size_type i) const + -> enable_simd_interface> { return m_e.template load_simd(data_offset() + i); } @@ -1542,9 +1553,10 @@ namespace xt template inline auto xview::make_index_sequence(Args...) const noexcept { - return std::make_index_sequence<(sizeof...(Args)+integral_count() > newaxis_count() ? - sizeof...(Args)+integral_count() - newaxis_count() : - 0)>(); + return std::make_index_sequence< + (sizeof...(Args) + integral_count() > newaxis_count() + ? sizeof...(Args) + integral_count() - newaxis_count() + : 0)>(); } template @@ -1552,11 +1564,12 @@ namespace xt inline auto xview::compute_strides_impl(std::index_sequence) const noexcept { std::size_t original_dim = m_e.dimension(); - return std::array({ - (static_cast(xt::step_size(std::get(I)>(m_slices), 1)) * - ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim ? - m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] : 1))... - }); + return std::array( + {(static_cast(xt::step_size(std::get(I)>(m_slices), 1)) + * ((integral_skip(I) - newaxis_count_before(integral_skip(I))) < original_dim + ? m_e.strides()[integral_skip(I) - newaxis_count_before(integral_skip(I))] + : 1))...} + ); } template @@ -1632,7 +1645,8 @@ namespace xt template template ::size_type... I, class... Args> - inline auto xview::unchecked_impl(std::index_sequence, Args... args) const -> const_reference + inline auto xview::unchecked_impl(std::index_sequence, Args... args) const + -> const_reference { return m_e.unchecked(index(args...)...); } @@ -1653,15 +1667,19 @@ namespace xt template template ::size_type I, class... Args> - inline auto xview::index(Args... 
args) const -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const + -> std::enable_if_t::value, size_type> { - return sliced_access(I) + newaxis_count_before(I + 1)> - (std::get(I + 1)>(m_slices), args...); + return sliced_access(I) + newaxis_count_before(I + 1)>( + std::get(I + 1)>(m_slices), + args... + ); } template template ::size_type I, class... Args> - inline auto xview::index(Args... args) const -> std::enable_if_t::value, size_type> + inline auto xview::index(Args... args) const + -> std::enable_if_t::value, size_type> { return argument() + newaxis_count()>(args...); } @@ -1678,7 +1696,8 @@ namespace xt inline auto xview::sliced_access(const xslice& slice, Arg arg, Args... args) const -> size_type { using ST = typename T::size_type; - return static_cast(slice.derived_cast()(argument(static_cast(arg), static_cast(args)...))); + return static_cast(slice.derived_cast( + )(argument(static_cast(arg), static_cast(args)...))); } template @@ -1695,14 +1714,17 @@ namespace xt auto index = xtl::make_sequence(m_e.dimension(), 0); using diff_type = typename std::iterator_traits::difference_type; using ivalue_type = typename base_index_type::value_type; - auto func1 = [&first](const auto& s) noexcept { + auto func1 = [&first](const auto& s) noexcept + { return get_slice_value(s, first); }; - auto func2 = [](const auto& s) noexcept { + auto func2 = [](const auto& s) noexcept + { return xt::value(s, 0); }; - auto s = static_cast((std::min)(static_cast(std::distance(first, last)), this->dimension())); + auto s = static_cast((std::min + )(static_cast(std::distance(first, last)), this->dimension())); auto first_copy = last - s; for (size_type i = 0; i != m_e.dimension(); ++i) { @@ -1714,13 +1736,12 @@ namespace xt if (first < last) { - index[i] = k < sizeof...(S) ? - apply(k, func1, m_slices) : static_cast(*first); + index[i] = k < sizeof...(S) ? apply(k, func1, m_slices) + : static_cast(*first); } else { - index[i] = k < sizeof...(S) ? - apply(k, func2, m_slices) : ivalue_type(0); + index[i] = k < sizeof...(S) ? apply(k, func2, m_slices) : ivalue_type(0); } } return index; @@ -1737,12 +1758,15 @@ namespace xt { std::size_t dim = m_e.dimension() - integral_count() + newaxis_count(); auto shape = xtl::make_sequence(dim, 0); - auto func = [](const auto& s) noexcept { return get_size(s); }; + auto func = [](const auto& s) noexcept + { + return get_size(s); + }; for (size_type i = 0; i != dim; ++i) { size_type index = integral_skip(i); - shape[i] = index < sizeof...(S) ? - apply(index, func, m_slices) : m_e.shape()[index - newaxis_count_before(index)]; + shape[i] = index < sizeof...(S) ? 
apply(index, func, m_slices) + : m_e.shape()[index - newaxis_count_before(index)]; } return shape; } @@ -1756,7 +1780,8 @@ namespace xt } template - inline void run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) + inline void + run_assign_temporary_impl(V& v, const T& t, std::false_type /* fallback to iterator assign */) { std::copy(t.cbegin(), t.cend(), v.begin()); } @@ -1765,8 +1790,8 @@ namespace xt template inline void xview::assign_temporary_impl(temporary_type&& tmp) { - constexpr bool fast_assign = detail::is_strided_view::value && \ - xassign_traits, temporary_type>::simd_strided_assign(); + constexpr bool fast_assign = detail::is_strided_view::value + && xassign_traits, temporary_type>::simd_strided_assign(); xview_detail::run_assign_temporary_impl(*this, tmp, std::integral_constant{}); } @@ -1799,8 +1824,13 @@ namespace xt { // Checks that no ellipsis slice is used using view_type = xview, get_slice_type, S>...>; - return view_type(std::forward(e), - get_slice_implementation(e, std::forward(slices), get_underlying_shape_index, S...>(I))... + return view_type( + std::forward(e), + get_slice_implementation( + e, + std::forward(slices), + get_underlying_shape_index, S...>(I) + )... ); } } @@ -1817,7 +1847,11 @@ namespace xt template inline auto view(E&& e, S&&... slices) { - return detail::make_view_impl(std::forward(e), std::make_index_sequence(), std::forward(slices)...); + return detail::make_view_impl( + std::forward(e), + std::make_index_sequence(), + std::forward(slices)... + ); } namespace detail @@ -1825,8 +1859,9 @@ namespace xt class row_impl { public: - template - static inline auto make(E&& e, const std::ptrdiff_t index) + + template + inline static auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1834,17 +1869,21 @@ namespace xt } private: - template - static inline void check_dimension(const S& shape) + + template + inline static void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW(std::invalid_argument, "A row can only be accessed on an expression with exact two dimensions"); + XTENSOR_THROW( + std::invalid_argument, + "A row can only be accessed on an expression with exact two dimensions" + ); } } - template - static inline void check_dimension(const std::array&) + template + inline static void check_dimension(const std::array&) { static_assert(N == 2, "A row can only be accessed on an expression with exact two dimensions"); } @@ -1853,8 +1892,9 @@ namespace xt class column_impl { public: - template - static inline auto make(E&& e, const std::ptrdiff_t index) + + template + inline static auto make(E&& e, const std::ptrdiff_t index) { const auto shape = e.shape(); check_dimension(shape); @@ -1862,17 +1902,21 @@ namespace xt } private: - template - static inline void check_dimension(const S& shape) + + template + inline static void check_dimension(const S& shape) { if (shape.size() != 2) { - XTENSOR_THROW(std::invalid_argument, "A column can only be accessed on an expression with exact two dimensions"); + XTENSOR_THROW( + std::invalid_argument, + "A column can only be accessed on an expression with exact two dimensions" + ); } } - template - static inline void check_dimension(const std::array&) + template + inline static void check_dimension(const std::array&) { static_assert(N == 2, "A column can only be accessed on an expression with exact two dimensions"); } @@ -1923,7 +1967,8 @@ namespace xt template template - inline auto 
xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, m_e.stepper_end(m_e.shape(), l), offset, true, l); @@ -1931,7 +1976,8 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1940,7 +1986,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); const xexpression_type& e = m_e; @@ -1957,7 +2004,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) -> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return stepper(this, data_xend(l, offset), offset); @@ -1965,7 +2013,8 @@ namespace xt template template - inline auto xview::stepper_begin(const ST& shape) const -> std::enable_if_t + inline auto xview::stepper_begin(const ST& shape) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xbegin(), offset); @@ -1973,7 +2022,8 @@ namespace xt template template - inline auto xview::stepper_end(const ST& shape, layout_type l) const-> std::enable_if_t + inline auto xview::stepper_end(const ST& shape, layout_type l) const + -> std::enable_if_t { size_type offset = shape.size() - this->dimension(); return const_stepper(this, data_xend(l, offset), offset); @@ -1984,14 +2034,24 @@ namespace xt ********************************/ template - inline xview_stepper::xview_stepper(view_type* view, substepper_type it, - size_type offset, bool end, layout_type l) - : p_view(view), m_it(it), m_offset(offset) + inline xview_stepper::xview_stepper( + view_type* view, + substepper_type it, + size_type offset, + bool end, + layout_type l + ) + : p_view(view) + , m_it(it) + , m_offset(offset) { if (!end) { std::fill(m_index_keeper.begin(), m_index_keeper.end(), 0); - auto func = [](const auto& s) noexcept { return xt::value(s, 0); }; + auto func = [](const auto& s) noexcept + { + return xt::value(s, 0); + }; for (size_type i = 0; i < sizeof...(S); ++i) { if (!is_newaxis_slice(i)) @@ -2017,14 +2077,18 @@ namespace xt template inline void xview_stepper::step(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_step_forward(dim, func); } template inline void xview_stepper::step_back(size_type dim) { - auto func = [this](size_type index, size_type offset) { + auto func = [this](size_type index, size_type offset) + { m_it.step_back(index, offset); }; common_step_backward(dim, func); @@ -2033,14 +2097,18 @@ namespace xt template inline void xview_stepper::step(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_step_forward(dim, n, func); } template inline void 
xview_stepper::step_back(size_type dim, size_type n) { - auto func = [this](size_type index, size_type offset) { + auto func = [this](size_type index, size_type offset) + { m_it.step_back(index, offset); }; common_step_backward(dim, n, func); @@ -2049,14 +2117,20 @@ namespace xt template inline void xview_stepper::reset(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step_back(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step_back(index, offset); + }; common_reset(dim, func, false); } template inline void xview_stepper::reset_back(size_type dim) { - auto func = [this](size_type index, size_type offset) { m_it.step(index, offset); }; + auto func = [this](size_type index, size_type offset) + { + m_it.step(index, offset); + }; common_reset(dim, func, true); } @@ -2084,10 +2158,12 @@ namespace xt template inline void xview_stepper::to_end_impl(layout_type l) { - auto func = [](const auto& s) noexcept { + auto func = [](const auto& s) noexcept + { return xt::value(s, get_size(s) - 1); }; - auto size_func = [](const auto& s) noexcept { + auto size_func = [](const auto& s) noexcept + { return get_size(s); }; @@ -2137,14 +2213,15 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept { + auto func = [&dim, this](const auto& s) noexcept + { return step_size(s, this->m_index_keeper[dim]++, 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : 1; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2157,7 +2234,8 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept { + auto func = [&dim, &n, this](const auto& s) noexcept + { auto st_size = step_size(s, this->m_index_keeper[dim], n); this->m_index_keeper[dim] += n; return size_type(st_size); @@ -2166,8 +2244,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : n; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2180,15 +2258,16 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, this](const auto& s) noexcept { + auto func = [&dim, this](const auto& s) noexcept + { this->m_index_keeper[dim]--; return step_size(s, this->m_index_keeper[dim], 1); }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : 1; + size_type step_size = index < sizeof...(S) ? apply(index, func, p_view->slices()) + : 1; index -= newaxis_count_before(index); f(index, step_size); } @@ -2201,7 +2280,8 @@ namespace xt { if (dim >= m_offset) { - auto func = [&dim, &n, this](const auto& s) noexcept { + auto func = [&dim, &n, this](const auto& s) noexcept + { this->m_index_keeper[dim] -= n; return step_size(s, this->m_index_keeper[dim], n); }; @@ -2209,8 +2289,8 @@ namespace xt size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { - size_type step_size = index < sizeof...(S) ? - apply(index, func, p_view->slices()) : n; + size_type step_size = index < sizeof...(S) ? 
apply(index, func, p_view->slices()) + : n; index -= newaxis_count_before(index); f(index, step_size); } @@ -2221,19 +2301,27 @@ namespace xt template void xview_stepper::common_reset(size_type dim, F f, bool backwards) { - auto size_func = [](const auto& s) noexcept { return get_size(s); }; - auto end_func = [](const auto& s) noexcept { return xt::value(s, get_size(s) - 1) - xt::value(s, 0); }; + auto size_func = [](const auto& s) noexcept + { + return get_size(s); + }; + auto end_func = [](const auto& s) noexcept + { + return xt::value(s, get_size(s) - 1) - xt::value(s, 0); + }; size_type index = integral_skip(dim); if (!is_newaxis_slice(index)) { if (dim < m_index_keeper.size()) { - size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) : p_view->shape()[dim]; + size_type size = index < sizeof...(S) ? apply(index, size_func, p_view->slices()) + : p_view->shape()[dim]; m_index_keeper[dim] = backwards ? size - 1 : 0; } - size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) : p_view->shape()[dim] - 1; + size_type reset_n = index < sizeof...(S) ? apply(index, end_func, p_view->slices()) + : p_view->shape()[dim] - 1; index -= newaxis_count_before(index); f(index, reset_n); } From aa9d078e2a53c3d00aa0fd2a88748ff6fca63dd3 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 11:04:52 +0100 Subject: [PATCH 10/33] Try at fixed strided loop assignment --- include/xtensor/xassign.hpp | 395 ++++++++++++++++++++++++------------ 1 file changed, 269 insertions(+), 126 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 87dc9e3c5..5b7e513ac 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1,16 +1,17 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * -* Copyright (c) QuantStack * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XTENSOR_ASSIGN_HPP #define XTENSOR_ASSIGN_HPP #include +#include #include #include @@ -18,12 +19,12 @@ #include #include "xexpression.hpp" +#include "xfunction.hpp" #include "xiterator.hpp" #include "xstrides.hpp" #include "xtensor_config.hpp" #include "xtensor_forward.hpp" #include "xutils.hpp" -#include "xfunction.hpp" #if defined(XTENSOR_USE_TBB) #include @@ -99,7 +100,6 @@ namespace xt template static bool resize(E1& e1, const xfunction& e2); - }; /******************** @@ -195,14 +195,18 @@ namespace xt template inline void assign_xexpression(xexpression& e1, const xexpression& e2) { - xtl::mpl::static_if::value>([&](auto self) - { - self(e2).derived_cast().assign_to(e1); - }, /*else*/ [&](auto /*self*/) - { - using tag = xexpression_tag_t; - xexpression_assigner::assign_xexpression(e1, e2); - }); + xtl::mpl::static_if::value>( + [&](auto self) + { + self(e2).derived_cast().assign_to(e1); + }, + /*else*/ + [&](auto /*self*/) + { + using tag = xexpression_tag_t; + xexpression_assigner::assign_xexpression(e1, e2); + } + ); } template @@ -238,12 +242,16 @@ namespace xt // A row_major or column_major container with a dimension <= 1 is computed as // layout any, leading to some performance improvements, for example when // assigning a col-major vector to a row-major vector etc - return compute_layout(select_layout::value, - select_layout::value) != layout_type::dynamic; + return compute_layout( + select_layout::value, + select_layout::value + ) + != layout_type::dynamic; } template - inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> + inline auto is_linear_assign(const E1& e1, const E2& e2) + -> std::enable_if_t::value, bool> { return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); @@ -258,9 +266,8 @@ namespace xt template inline bool linear_dynamic_layout(const E1& e1, const E2& e2) { - return e1.is_contiguous() - && e2.is_contiguous() - && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; + return e1.is_contiguous() && e2.is_contiguous() + && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic; } template @@ -269,16 +276,20 @@ namespace xt }; template - struct has_step_leading().step_leading())>> - : std::true_type + struct has_step_leading().step_leading())>> : std::true_type { }; template struct use_strided_loop { - static constexpr bool stepper_deref() { return std::is_reference::value; } - static constexpr bool value = has_strides::value && has_step_leading::value && stepper_deref(); + static constexpr bool stepper_deref() + { + return std::is_reference::value; + } + + static constexpr bool value = has_strides::value + && has_step_leading::value && stepper_deref(); }; template @@ -332,43 +343,89 @@ namespace xt template using is_bool = std::is_same; - static constexpr bool is_bool_conversion() { return is_bool::value && !is_bool::value; } - static constexpr bool contiguous_layout() { return E1::contiguous_layout && E2::contiguous_layout; } - static constexpr bool convertible_types() { return std::is_convertible::value - && !is_bool_conversion(); } + static constexpr bool is_bool_conversion() + { + return is_bool::value && !is_bool::value; + } + + static constexpr bool contiguous_layout() + { + return E1::contiguous_layout && E2::contiguous_layout; + } + + static constexpr bool convertible_types() + { + return std::is_convertible::value && 
!is_bool_conversion(); + } - static constexpr bool use_xsimd() { return xt_simd::simd_traits::size > 1; } + static constexpr bool use_xsimd() + { + return xt_simd::simd_traits::size > 1; + } template - static constexpr bool simd_size_impl() { return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); } - static constexpr bool simd_size() { return simd_size_impl() && simd_size_impl(); } - static constexpr bool simd_interface() { return has_simd_interface() && - has_simd_interface(); } + static constexpr bool simd_size_impl() + { + return xt_simd::simd_traits::size > 1 || (is_bool::value && use_xsimd()); + } + + static constexpr bool simd_size() + { + return simd_size_impl() && simd_size_impl(); + } + + static constexpr bool simd_interface() + { + return has_simd_interface() + && has_simd_interface(); + } public: // constexpr methods instead of constexpr data members avoid the need of definitions at namespace // scope of these data members (since they are odr-used). - static constexpr bool simd_assign() { return convertible_types() && simd_size() && simd_interface(); } - static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) { return trivial && detail::is_linear_assign(e1, e2); } - static constexpr bool strided_assign() { return detail::use_strided_loop::value && detail::use_strided_loop::value; } - static constexpr bool simd_linear_assign() { return contiguous_layout() && simd_assign(); } - static constexpr bool simd_strided_assign() { return strided_assign() && simd_assign(); } + static constexpr bool simd_assign() + { + return convertible_types() && simd_size() && simd_interface(); + } + + static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial) + { + return trivial && detail::is_linear_assign(e1, e2); + } + + static constexpr bool strided_assign() + { + return detail::use_strided_loop::value && detail::use_strided_loop::value; + } + + static constexpr bool simd_linear_assign() + { + return contiguous_layout() && simd_assign(); + } - static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) { return simd_assign() - && detail::linear_dynamic_layout(e1, e2); } + static constexpr bool simd_strided_assign() + { + return strided_assign() && simd_assign(); + } - using e2_requested_value_type = std::conditional_t::value, - typename E2::bool_load_type, - e2_value_type>; - using requested_value_type = detail::conditional_promote_to_complex_t; + static constexpr bool simd_linear_assign(const E1& e1, const E2& e2) + { + return simd_assign() && detail::linear_dynamic_layout(e1, e2); + } + using e2_requested_value_type = std:: + conditional_t::value, typename E2::bool_load_type, e2_value_type>; + using requested_value_type = detail::conditional_promote_to_complex_t; }; template - inline void xexpression_assigner_base::assign_data(xexpression& e1, const xexpression& e2, bool trivial) + inline void xexpression_assigner_base::assign_data( + xexpression& e1, + const xexpression& e2, + bool trivial + ) { E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -378,9 +435,10 @@ namespace xt constexpr bool simd_assign = traits::simd_assign(); constexpr bool simd_linear_assign = traits::simd_linear_assign(); constexpr bool simd_strided_assign = traits::simd_strided_assign(); + constexpr bool strided_assign = traits::strided_assign(); if (linear_assign) { - if(simd_linear_assign || traits::simd_linear_assign(de1, de2)) + if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) { // Do not use linear_assigner here since it 
will make the compiler // instantiate this branch even if the runtime condition is false, resulting @@ -394,7 +452,7 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign && de1.layout() == de2.layout()) + else if (simd_strided_assign) { strided_loop_assigner::run(de1, de2); } @@ -418,16 +476,27 @@ namespace xt inline void xexpression_assigner::computed_assign(xexpression& e1, const xexpression& e2) { using shape_type = typename E1::shape_type; + using comperator_type = std::greater; + using size_type = typename E1::size_type; E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); - size_type dim = de2.dimension(); - shape_type shape = uninitialized_shape(dim); + size_type dim2 = de2.dimension(); + shape_type shape = uninitialized_shape(dim2); + bool trivial_broadcast = de2.broadcast_shape(shape, true); - if (dim > de1.dimension() || shape > de1.shape()) + auto&& de1_shape = de1.shape(); + if (dim2 > de1.dimension() + || std::lexicographical_compare( + shape.begin(), + shape.end(), + de1_shape.begin(), + de1_shape.end(), + comperator_type() + )) { typename E1::temporary_type tmp(shape); base_type::assign_data(tmp, e2, trivial_broadcast); @@ -455,7 +524,8 @@ namespace xt template template - inline void xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) + inline void + xexpression_assigner::assert_compatible_shape(const xexpression& e1, const xexpression& e2) { const E1& de1 = e1.derived_cast(); const E2& de2 = e2.derived_cast(); @@ -499,16 +569,20 @@ namespace xt inline bool xexpression_assigner::resize(E1& e1, const xfunction& e2) { return xtl::mpl::static_if::shape_type>::value>( - [&](auto /*self*/) { + [&](auto /*self*/) + { /* * If the shape of the xfunction is statically known, we can compute the broadcast triviality * at compile time plus we can resize right away. */ // resize in case LHS is not a fixed size container. 
If it is, this is a NOP e1.resize(typename xfunction::shape_type{}); - return detail::static_trivial_broadcast::shape_type>::value, CT...>::value; + return detail::static_trivial_broadcast< + detail::is_fixed::shape_type>::value, + CT...>::value; }, - /* else */ [&](auto /*self*/) + /* else */ + [&](auto /*self*/) { using index_type = xindex_type_t; using size_type = typename E1::size_type; @@ -531,9 +605,10 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = xtl::is_arithmetic::value && - (sizeof(result_type) < sizeof(argument_type) || - (xtl::is_integral::value && std::is_floating_point::value)); + static const bool value = xtl::is_arithmetic::value + && (sizeof(result_type) < sizeof(argument_type) + || (xtl::is_integral::value + && std::is_floating_point::value)); }; template @@ -551,15 +626,16 @@ namespace xt using argument_type = std::decay_t; using result_type = std::decay_t; - static const bool value = is_narrowing_conversion::value || - has_sign_conversion::value; + static const bool value = is_narrowing_conversion::value + || has_sign_conversion::value; }; template inline stepper_assigner::stepper_assigner(E1& e1, const E2& e2) - : m_e1(e1), m_lhs(e1.stepper_begin(e1.shape())), - m_rhs(e2.stepper_begin(e1.shape())), - m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) + : m_e1(e1) + , m_lhs(e1.stepper_begin(e1.shape())) + , m_rhs(e2.stepper_begin(e1.shape())) + , m_index(xtl::make_sequence(e1.shape().size(), size_type(0))) { } @@ -637,10 +713,29 @@ namespace xt } #if defined(XTENSOR_USE_TBB) - tbb::parallel_for(align_begin, align_end, simd_size, [&e1, &e2](size_t i) + if (size >= XTENSOR_TBB_THRESHOLD) { - e1.template store_simd(i, e2.template load_simd(i)); - }); + static tbb::affinity_partitioner ap; + tbb::parallel_for( + align_begin, + align_end, + simd_size, + [&e1, &e2](size_t i) + { + e1.template store_simd( + i, + e2.template load_simd(i) + ); + }, ap + ); + } + else + { + for (size_type i = align_begin; i < align_end; i += simd_size) + { + e1.template store_simd(i, e2.template load_simd(i)); + } + } #elif defined(XTENSOR_USE_OPENMP) if (size >= size_type(XTENSOR_OPENMP_TRESHOLD)) { @@ -651,8 +746,8 @@ namespace xt e1.template store_simd(i, e2.template load_simd(i)); } #else - for(auto i = static_cast(align_begin); i < static_cast(align_end); - i += static_cast(simd_size)) + for (auto i = static_cast(align_begin); i < static_cast(align_end); + i += static_cast(simd_size)) { size_type ui = static_cast(i); e1.template store_simd(ui, e2.template load_simd(ui)); @@ -681,8 +776,8 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { - using is_convertible = std::is_convertible::value_type, - typename std::decay_t::value_type>; + using is_convertible = std:: + is_convertible::value_type, typename std::decay_t::value_type>; // If the types are not compatible, this function is still instantiated but never called. // To avoid compilation problems in effectively unused code trivial_assigner_run_impl is // empty in this case. 
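// The TBB branch of the SIMD linear assigner above follows a simple chunking scheme:
// step the flat range in simd_size-wide increments, dispatch the steps with
// tbb::parallel_for only above a size threshold, and keep a serial loop otherwise.
// A minimal standalone sketch of that pattern (kTbbThreshold and the scalar inner
// loop are illustrative stand-ins for XTENSOR_TBB_THRESHOLD and the
// store_simd/load_simd pair; simd_size is assumed to be non-zero):
#include <cstddef>
#include <vector>
#include <tbb/parallel_for.h>

template <class T>
void chunked_parallel_copy(std::vector<T>& dst, const std::vector<T>& src, std::size_t simd_size)
{
    const std::size_t n = dst.size();
    const std::size_t tail_begin = n - n % simd_size;  // unaligned remainder is handled serially
    const std::size_t kTbbThreshold = std::size_t(1) << 16;
    if (n >= kTbbThreshold)
    {
        tbb::parallel_for(
            std::size_t(0),
            tail_begin,
            simd_size,
            [&](std::size_t i)
            {
                // one simd_size-wide block per task index, standing in for store_simd/load_simd
                for (std::size_t j = i; j < i + simd_size; ++j)
                {
                    dst[j] = src[j];
                }
            }
        );
    }
    else
    {
        for (std::size_t i = 0; i < tail_begin; ++i)
        {
            dst[i] = src[i];
        }
    }
    for (std::size_t i = tail_begin; i < n; ++i)  // scalar epilogue, like the loop after align_end
    {
        dst[i] = src[i];
    }
}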
@@ -699,10 +794,14 @@ namespace xt size_type n = e1.size(); // std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) - { - *(dst + i) = static_cast(*(src + i)); - }); + tbb::parallel_for( + std::ptrdiff_t(0), + static_cast(n), + [&](std::ptrdiff_t i) + { + *(dst + i) = static_cast(*(src + i)); + } + ); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { @@ -733,9 +832,7 @@ namespace xt template inline void linear_assigner::run_impl(E1&, const E2&, std::false_type /*is_convertible*/) { - XTENSOR_PRECONDITION(false, - "Internal error: linear_assigner called with unrelated types."); - + XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types."); } /**************************************** @@ -851,14 +948,13 @@ namespace xt using strides_type = S; check_strides_functor(const S& strides) - : m_cut(L == layout_type::row_major ? 0 : strides.size()), - m_strides(strides) + : m_cut(L == layout_type::row_major ? 0 : strides.size()) + , m_strides(strides) { } template - std::enable_if_t - operator()(const T& el) + std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) @@ -869,8 +965,7 @@ namespace xt } template - std::enable_if_t - operator()(const T& el) + std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); if (var < m_cut) @@ -913,18 +1008,25 @@ namespace xt } else if (E1::static_layout == layout_type::column_major || !is_row_major) { - auto csf = check_strides_functor(e1.strides()); + auto csf = check_strides_functor(e1.strides() + ); cut = csf(e2); if (cut>1) cut = 1; } // can't reach here because this would have already triggered the fallback using shape_value_type = typename E1::shape_type::value_type; - std::size_t outer_loop_size = static_cast( - std::accumulate(e1.shape().begin(), e1.shape().begin() + static_cast(cut), - shape_value_type(1), std::multiplies{})); - std::size_t inner_loop_size = static_cast( - std::accumulate(e1.shape().begin() + static_cast(cut), e1.shape().end(), - shape_value_type(1), std::multiplies{})); + std::size_t outer_loop_size = static_cast(std::accumulate( + e1.shape().begin(), + e1.shape().begin() + static_cast(cut), + shape_value_type(1), + std::multiplies{} + )); + std::size_t inner_loop_size = static_cast(std::accumulate( + e1.shape().begin() + static_cast(cut), + e1.shape().end(), + shape_value_type(1), + std::multiplies{} + )); if (E1::static_layout == layout_type::column_major || !is_row_major) { @@ -942,37 +1044,30 @@ namespace xt { bool is_row_major = true; using fallback_assigner = stepper_assigner; - if (E1::static_layout == layout_type::dynamic) - { - layout_type dynamic_layout = e1.layout(); - switch (dynamic_layout) - { - case layout_type::row_major: - is_row_major = true; - break; - case layout_type::column_major: - is_row_major = false; - break; - default: - return fallback_assigner(e1, e2).run(); - } - } - else if (E1::static_layout == layout_type::row_major) - { + bool mismatch_colrow = (e1.layout() == layout_type::row_major && e2.layout() == layout_type::column_major) + || (e2.layout() == layout_type::row_major && e1.layout() == layout_type::column_major); + if (mismatch_colrow) + return fallback_assigner(e1, e2).run(); + + bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; + bool de1_col_contiguous = e1.strides()[0] == 1; + bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; + bool de2_col_contiguous = e2.strides()[0] 
== 1; + + if (de1_row_contiguous && de2_row_contiguous) is_row_major = true; - } - else if (E1::static_layout == layout_type::column_major) - { + else if (de1_col_contiguous && de2_col_contiguous) is_row_major = false; - } else - { - XTENSOR_THROW(std::runtime_error, "Illegal layout set (layout_type::any?)."); - } + return fallback_assigner(e1, e2).run(); std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes(e1, e2, is_row_major); + std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( + e1, + e2, + is_row_major + ); if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) { @@ -999,9 +1094,10 @@ namespace xt using e2_value_type = typename E2::value_type; constexpr bool needs_cast = has_assign_conversion::value; using value_type = typename xassign_traits::requested_value_type; - using simd_type = std::conditional_t::value, - xt_simd::simd_bool_type, - xt_simd::simd_type>; + using simd_type = std::conditional_t< + std::is_same::value, + xt_simd::simd_bool_type, + xt_simd::simd_type>; std::size_t simd_size = inner_loop_size / simd_type::size; std::size_t simd_rest = inner_loop_size % simd_type::size; @@ -1012,7 +1108,7 @@ namespace xt // TODO in 1D case this is ambigous -- could be RM or CM. // Use default layout to make decision std::size_t step_dim = 0; - if (!is_row_major) // row major case + if (!is_row_major) // row major case { step_dim = cut; } @@ -1063,7 +1159,54 @@ namespace xt } } else { - +#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) + if (outer_loop_size > 20) { + tbb::parallel_for(0ul, outer_loop_size, + [&, fct_stepper, res_stepper, idx](tbb::blocked_range const &r) { + std::size_t first_step = true; +//#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { + if (first_step) { + is_row_major ? + strided_assign_detail::idx_tools::nth_idx(ox, idx, + max_shape) : + strided_assign_detail::idx_tools::nth_idx(ox, idx, + max_shape); + + for (std::size_t i = 0; i < idx.size(); ++i) { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + + for (std::size_t i = 0; i < simd_size; ++i) { + res_stepper.template store_simd( + fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major ? + strided_assign_detail::idx_tools::next_idx(idx, max_shape) : + strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + }); + } + else { + #endif for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { @@ -1078,9 +1221,9 @@ namespace xt fct_stepper.step_leading(); } - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); + is_row_major + ? 
strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); fct_stepper.to_begin(); From 68ba3def8a9457fe70e923734f9f5a276b86caee Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 11:42:14 +0100 Subject: [PATCH 11/33] Compiles and maybe works now.. --- include/xtensor/xassign.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5b7e513ac..b82c04268 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1052,12 +1052,12 @@ namespace xt bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; bool de1_col_contiguous = e1.strides()[0] == 1; - bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; - bool de2_col_contiguous = e2.strides()[0] == 1; + //bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; + //bool de2_col_contiguous = e2.strides()[0] == 1; - if (de1_row_contiguous && de2_row_contiguous) + if (de1_row_contiguous)// && de2_row_contiguous) is_row_major = true; - else if (de1_col_contiguous && de2_col_contiguous) + else if (de1_col_contiguous)// && de2_col_contiguous) is_row_major = false; else return fallback_assigner(e1, e2).run(); @@ -1161,8 +1161,11 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) if (outer_loop_size > 20) { - tbb::parallel_for(0ul, outer_loop_size, - [&, fct_stepper, res_stepper, idx](tbb::blocked_range const &r) { + tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), + [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) { + auto idx = idx_; + auto res_stepper = res_stepper_; + auto fct_stepper = fct_stepper_; std::size_t first_step = true; //#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { @@ -1245,7 +1248,7 @@ namespace xt } } } -#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) +#if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign) } #endif } From 0a62ded224c3f3f821192c5157bb6be390d299e0 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 14:42:41 +0100 Subject: [PATCH 12/33] Use static partitioner instead --- include/xtensor/xassign.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index b82c04268..f672d244e 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -715,7 +715,7 @@ namespace xt #if defined(XTENSOR_USE_TBB) if (size >= XTENSOR_TBB_THRESHOLD) { - static tbb::affinity_partitioner ap; + tbb::static_partitioner ap; tbb::parallel_for( align_begin, align_end, @@ -794,14 +794,14 @@ namespace xt size_type n = e1.size(); // std::cout << "Linear assigner LOOP SIZE is " << n <(n), [&](std::ptrdiff_t i) { *(dst + i) = static_cast(*(src + i)); - } - ); + }, sp); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { @@ -1161,6 +1161,7 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) if (outer_loop_size > 20) { + tbb::static_partitioner sp; tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) 
{ auto idx = idx_; @@ -1206,7 +1207,7 @@ namespace xt res_stepper.step(i + step_dim, idx[i]); } } - }); + }, sp); } else { From 3c4e93a148cdacd01af5b9919a027f10300ad230 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 14:52:37 +0100 Subject: [PATCH 13/33] Remove unused variable --- include/xtensor/xassign.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index f672d244e..db3d9435a 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -435,7 +435,6 @@ namespace xt constexpr bool simd_assign = traits::simd_assign(); constexpr bool simd_linear_assign = traits::simd_linear_assign(); constexpr bool simd_strided_assign = traits::simd_strided_assign(); - constexpr bool strided_assign = traits::strided_assign(); if (linear_assign) { if (simd_linear_assign || traits::simd_linear_assign(de1, de2)) From d1209f2863f095c7f18af6351ece858891cf07b6 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Wed, 1 Feb 2023 15:21:29 +0100 Subject: [PATCH 14/33] Make explicitly the steppers in teh lambda instead of a copy --- include/xtensor/xassign.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index db3d9435a..5be9f947c 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1162,10 +1162,10 @@ namespace xt if (outer_loop_size > 20) { tbb::static_partitioner sp; tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), - [is_row_major, step_dim, simd_size, simd_rest, &max_shape, &fct_stepper_=fct_stepper, &res_stepper_=res_stepper, &idx_=idx](tbb::blocked_range const &r) { + [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_=idx](tbb::blocked_range const &r) { auto idx = idx_; - auto res_stepper = res_stepper_; - auto fct_stepper = fct_stepper_; + auto fct_stepper = e2.stepper_begin(e1.shape()); + auto res_stepper = e1.stepper_begin(e1.shape()); std::size_t first_step = true; //#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { From 01c8b4c00d9d57c94c8a37373ab187b20e123e99 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 2 Feb 2023 20:12:17 +0100 Subject: [PATCH 15/33] Fix the strided assigner --- include/xtensor/xassign.hpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 5be9f947c..e6a43580a 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -955,6 +955,7 @@ namespace xt template std::enable_if_t operator()(const T& el) { + // All dimenions less than var have differing strides auto var = check_strides_overlap::get(m_strides, el.strides()); if (var > m_cut) { @@ -967,6 +968,7 @@ namespace xt std::enable_if_t operator()(const T& el) { auto var = check_strides_overlap::get(m_strides, el.strides()); + // All dimensions >= var have differing strides if (var < m_cut) { m_cut = var; @@ -996,16 +998,18 @@ namespace xt template auto get_loop_sizes(const E1& e1, const E2& e2, bool is_row_major) { + // Cut is the number of dimensions in the outer loop std::size_t cut = 0; // TODO! if E1 is !contiguous --> initialize cut to sensible value! - if (E1::static_layout == layout_type::row_major || is_row_major) + if (is_row_major) { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); + // This makes that only one dimension will be treated in the inner loop. 
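// Concrete example of the cut in the row-major case: for an assignee of shape
// {4, 3, 5} whose strides differ from the source over the leading dimensions
// (e.g. because of broadcasting), forcing the cut to 2 puts the first two
// dimensions in the outer loop (outer_loop_size = 4 * 3 = 12) and leaves the
// last, contiguous dimension of length 5 to the linear/SIMD inner loop; in the
// column-major case the two products swap roles.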
if (cut(e1.strides() ); @@ -1027,7 +1031,7 @@ namespace xt std::multiplies{} )); - if (E1::static_layout == layout_type::column_major || !is_row_major) + if (!is_row_major) { std::swap(outer_loop_size, inner_loop_size); } @@ -1044,23 +1048,21 @@ namespace xt bool is_row_major = true; using fallback_assigner = stepper_assigner; - bool mismatch_colrow = (e1.layout() == layout_type::row_major && e2.layout() == layout_type::column_major) - || (e2.layout() == layout_type::row_major && e1.layout() == layout_type::column_major); - if (mismatch_colrow) - return fallback_assigner(e1, e2).run(); - + // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` dimensions, + // and the inner loop is + is_row_major = true; bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; bool de1_col_contiguous = e1.strides()[0] == 1; - //bool de2_row_contiguous = e2.strides()[e2.strides().size()-1] == 1; - //bool de2_col_contiguous = e2.strides()[0] == 1; - - if (de1_row_contiguous)// && de2_row_contiguous) + if (de1_row_contiguous) is_row_major = true; - else if (de1_col_contiguous)// && de2_col_contiguous) + else if (de1_col_contiguous) // If first stride is 1, and we don't have a one-dimensional container is_row_major = false; else - return fallback_assigner(e1, e2).run(); + // We might be dealing with a broadcasted dimension at the end. + // Since this is not detected explicitly, just try row-major by default + is_row_major = true; + // cut is the number of dimensions that the outer loop takes care of. std::size_t inner_loop_size, outer_loop_size, cut; std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( e1, From e4558a630a7fba31b7652bab470f26849595c7f0 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 23 Feb 2023 17:51:18 +0100 Subject: [PATCH 16/33] Add a test, and fix a few bugs for the strided (parallel) assigner. --- include/xtensor/xassign.hpp | 518 +++++++++++++++++++-------------- include/xtensor/xcontainer.hpp | 21 +- test/CMakeLists.txt | 1 + test/test_strided_assign.cpp | 114 ++++++++ 4 files changed, 426 insertions(+), 228 deletions(-) create mode 100644 test/test_strided_assign.cpp diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index e6a43580a..4a68827b6 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -172,13 +172,29 @@ namespace xt * strided_loop_assigner * *************************/ + namespace strided_assign_detail { + struct loop_sizes_t { + bool is_row_major; + std::size_t inner_loop_size; + std::size_t outer_loop_size; + std::size_t cut; + std::size_t dimension; + inline bool can_do_strided_assign() { // Is it actually useful to do a strided assign? 
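// A strided assign only pays off when the contiguous inner run spans more than one
// element; otherwise the caller falls back to the generic stepper_assigner.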
+ return inner_loop_size > 1; + //return !((is_row_major && cut == dimension) || (!is_row_major && cut == 0)) && inner_loop_size != 1; + }; + }; + } template class strided_loop_assigner { public: - + using loop_sizes_t = strided_assign_detail::loop_sizes_t; + // is_row_major, inner_loop_size, outer_loop_size, cut template - static void run(E1& e1, const E2& e2); + static void run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes); + template + static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2); }; /*********************************** @@ -253,8 +269,8 @@ namespace xt inline auto is_linear_assign(const E1& e1, const E2& e2) -> std::enable_if_t::value, bool> { - return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) || - (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); + return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout()) + || (e1.is_contiguous() && e2.has_linear_assign(e1.strides())); } template @@ -451,14 +467,20 @@ namespace xt linear_assigner::run(de1, de2); } } - else if (simd_strided_assign) - { - strided_loop_assigner::run(de1, de2); - } else { - stepper_assigner assigner(de1, de2); - assigner.run(); + if (simd_strided_assign) + { + strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(de1, de2); + if (loop_sizes.can_do_strided_assign()) + strided_loop_assigner::run(de1, de2, loop_sizes); + else + stepper_assigner(de1, de2).run(); + } + else + { + stepper_assigner(de1, de2).run(); + } } } @@ -690,7 +712,6 @@ namespace xt template inline void linear_assigner::run(E1& e1, const E2& e2) { -// std::cout << "lin" << std::endl; using lhs_align_mode = xt_simd::container_alignment_t; constexpr bool is_aligned = std::is_same::value; using rhs_align_mode = std::conditional_t; @@ -725,7 +746,8 @@ namespace xt i, e2.template load_simd(i) ); - }, ap + }, + ap ); } else @@ -791,7 +813,6 @@ namespace xt auto src = linear_begin(e2); auto dst = linear_begin(e1); size_type n = e1.size(); -// std::cout << "Linear assigner LOOP SIZE is " << n <(*(src + i)); - }, sp); + }, + sp + ); #elif defined(XTENSOR_USE_OPENMP) if (n >= XTENSOR_OPENMP_TRESHOLD) { #pragma omp parallel for default(none) shared(src, dst, n) - for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n) ; i++) + for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast(n); i++) { *(dst + i) = static_cast(*(src + i)); } } - else { + else + { for (; n > size_type(0); --n) { *dst = static_cast(*src); ++src; ++dst; - } + } } #else for (; n > size_type(0); --n) @@ -863,34 +887,35 @@ namespace xt } } } - - template + + template static void nth_idx(size_t n, T& outer_index, const T& outer_shape) - { - - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - // compute strides - using size_type = typename T::size_type; - for ( size_type i = outer_shape.size(); i>0; i--) { - stride_sizes[i-1] = (i==outer_shape.size()) ? 1 : stride_sizes[i] * outer_shape[i]; - } - - // compute index - for (size_type i=0; i < outer_shape.size(); i++) + { + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + // compute strides + using size_type = typename T::size_type; + for (size_type i = outer_shape.size(); i > 0; i--) + { + stride_sizes[i - 1] = (i == outer_shape.size()) ? 
1 : stride_sizes[i] * outer_shape[i]; + } + + // compute index + for (size_type i = 0; i < outer_shape.size(); i++) { - auto d_idx = n/stride_sizes[i]; - if (d_idx < outer_shape[i]) { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - else { - // that may not happen - outer_index[i] = 0; - } + auto d_idx = n / stride_sizes[i]; + if (d_idx < outer_shape[i]) + { + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; + } + else + { + // that may not happen + outer_index[i] = 0; + } } - } - + } }; template <> @@ -915,30 +940,34 @@ namespace xt } } } - - template + + template static void nth_idx(size_t n, T& outer_index, const T& outer_shape) - { - dynamic_shape stride_sizes; - xt::resize_container(stride_sizes, outer_shape.size()); - - using size_type = typename T::size_type; - - // compute required strides - for (size_type i = 0; i < outer_shape.size(); i++) { stride_sizes[i] = (i==0) ? 1 : stride_sizes[i-1] * outer_shape[i-1]; } - - // compute index - for (size_type i = outer_shape.size(); i>0;) - { - i--; - auto d_idx = n/stride_sizes[i]; - if (d_idx<(outer_shape[i]-1)) { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } -// else {} // that may not happen + { + dynamic_shape stride_sizes; + xt::resize_container(stride_sizes, outer_shape.size()); + + using size_type = typename T::size_type; + + // compute required strides + for (size_type i = 0; i < outer_shape.size(); i++) + { + stride_sizes[i] = (i == 0) ? 1 : stride_sizes[i - 1] * outer_shape[i - 1]; + } + + // compute index + for (size_type i = outer_shape.size(); i > 0;) + { + i--; + auto d_idx = n / stride_sizes[i]; + if (d_idx < (outer_shape[i] - 1)) + { + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; + } + // else {} // that may not happen } - } + } }; template @@ -996,28 +1025,59 @@ namespace xt }; template - auto get_loop_sizes(const E1& e1, const E2& e2, bool is_row_major) + loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { + using shape_value_type = typename E1::shape_type::value_type; + bool is_row_major = true; + + // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` + // dimensions, and the inner loop is + is_row_major = true; + auto is_zero = [](auto i){return i == 0;}; + auto &&strides = e1.strides(); + auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero); + bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1; + auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero); + bool de1_col_contiguous = it_fwd != strides.end() && *it_fwd == 1; + if (de1_row_contiguous) + { + is_row_major = true; + } + else if (de1_col_contiguous) + { + is_row_major = false; + } + else + { + // No strided loop possible. + return {true, 1, e1.size(), e1.dimension(), e1.dimension()}; + } + // Cut is the number of dimensions in the outer loop std::size_t cut = 0; - // TODO! if E1 is !contiguous --> initialize cut to sensible value! if (is_row_major) { auto csf = check_strides_functor(e1.strides()); cut = csf(e2); // This makes that only one dimension will be treated in the inner loop. 
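// Standalone sketch of what the idx_tools::nth_idx helpers above compute in the
// row-major case: the usual unravel-index mapping from a flat position n to a
// per-dimension index (assumes a non-empty shape with non-zero extents; this is
// an illustration, not part of the xtensor API):
#include <cstddef>
#include <vector>

inline std::vector<std::size_t> unravel_row_major(std::size_t n, const std::vector<std::size_t>& shape)
{
    std::vector<std::size_t> index(shape.size(), 0);
    std::vector<std::size_t> strides(shape.size(), 1);
    for (std::size_t i = shape.size() - 1; i > 0; --i)
    {
        strides[i - 1] = strides[i] * shape[i];  // row-major: last dimension is contiguous
    }
    for (std::size_t i = 0; i < shape.size(); ++i)
    {
        index[i] = n / strides[i];
        n -= index[i] * strides[i];
    }
    return index;
}

// e.g. unravel_row_major(7, {4, 3, 2}) yields {1, 0, 1}, since 7 == 1*6 + 0*2 + 1*1.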
- if (cut(e1.strides() - ); + auto csf = check_strides_functor(e1.strides()); cut = csf(e2); - if (cut>1) cut = 1; - } // can't reach here because this would have already triggered the fallback + if (cut > 1) + { + // Only make the inner loop go over one dimension by default for now + cut = 1; + } + } // can't reach here because this would have already triggered the fallback - using shape_value_type = typename E1::shape_type::value_type; std::size_t outer_loop_size = static_cast(std::accumulate( e1.shape().begin(), e1.shape().begin() + static_cast(cut), @@ -1036,44 +1096,28 @@ namespace xt std::swap(outer_loop_size, inner_loop_size); } - return std::make_tuple(inner_loop_size, outer_loop_size, cut); + return {is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; } } -#define strided_parallel_assign template template - inline void strided_loop_assigner::run(E1& e1, const E2& e2) + inline strided_assign_detail::loop_sizes_t strided_loop_assigner::get_loop_sizes(E1& e1, const E2& e2) { - bool is_row_major = true; - using fallback_assigner = stepper_assigner; - - // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` dimensions, - // and the inner loop is - is_row_major = true; - bool de1_row_contiguous = e1.strides()[e1.strides().size()-1] == 1; - bool de1_col_contiguous = e1.strides()[0] == 1; - if (de1_row_contiguous) - is_row_major = true; - else if (de1_col_contiguous) // If first stride is 1, and we don't have a one-dimensional container - is_row_major = false; - else - // We might be dealing with a broadcasted dimension at the end. - // Since this is not detected explicitly, just try row-major by default - is_row_major = true; + return strided_assign_detail::get_loop_sizes(e1, e2); + } - // cut is the number of dimensions that the outer loop takes care of. - std::size_t inner_loop_size, outer_loop_size, cut; - std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes( - e1, - e2, - is_row_major - ); +#define strided_parallel_assign + + template + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes) + { + bool is_row_major = loop_sizes.is_row_major; + std::size_t inner_loop_size = loop_sizes.inner_loop_size; + std::size_t outer_loop_size = loop_sizes.outer_loop_size; + std::size_t cut = loop_sizes.cut; - if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0)) - { - return fallback_assigner(e1, e2).run(); - } // TODO can we get rid of this and use `shape_type`? dynamic_shape idx, max_shape; @@ -1113,128 +1157,45 @@ namespace xt { step_dim = cut; } -// std::cout << "OUTER LOOP size is " << outer_loop_size << " INNER LOOP size is " << inner_loop_size <20) { - std::size_t first_step = true; -#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) - { - if (first_step) { - is_row_major ? 
- strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox,idx, max_shape); - - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.template store_simd(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) - { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) - { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - } - } - else { -#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) - if (outer_loop_size > 20) { - tbb::static_partitioner sp; - tbb::parallel_for(tbb::blocked_range(0ul, outer_loop_size), - [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_=idx](tbb::blocked_range const &r) { - auto idx = idx_; - auto fct_stepper = e2.stepper_begin(e1.shape()); - auto res_stepper = e1.stepper_begin(e1.shape()); - std::size_t first_step = true; -//#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) - for (std::size_t ox = r.begin(); ox < r.end(); ++ox) { - if (first_step) { - is_row_major ? - strided_assign_detail::idx_tools::nth_idx(ox, idx, - max_shape) : - strided_assign_detail::idx_tools::nth_idx(ox, idx, - max_shape); - - for (std::size_t i = 0; i < idx.size(); ++i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - first_step = false; - } - - for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd( - fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } - - // next unaligned index - is_row_major ? - strided_assign_detail::idx_tools::next_idx(idx, max_shape) : - strided_assign_detail::idx_tools::next_idx(idx, max_shape); - - // need to step E1 as well if not contigous assign (e.g. view) - fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); - } - } - }, sp); - } - else { - -#endif - for (std::size_t ox = 0; ox < outer_loop_size; ++ox) + if (outer_loop_size > 20) { - for (std::size_t i = 0; i < simd_size; ++i) - { - res_stepper.store_simd(fct_stepper.template step_simd()); - } - for (std::size_t i = 0; i < simd_rest; ++i) + std::size_t first_step = true; +#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { - *(res_stepper) = conditional_cast(*(fct_stepper)); - res_stepper.step_leading(); - fct_stepper.step_leading(); - } + if (first_step) + { + is_row_major + ? 
strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape) + : strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape); + + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } - is_row_major - ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) - : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } - fct_stepper.to_begin(); + // next unaligned index + is_row_major + ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) - if (!E1::contiguous_layout) - { + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); res_stepper.to_begin(); for (std::size_t i = 0; i < idx.size(); ++i) { @@ -1242,22 +1203,129 @@ namespace xt res_stepper.step(i + step_dim, idx[i]); } } - else + } + else + { +#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) + if (outer_loop_size > 20) + { + tbb::static_partitioner sp; + tbb::parallel_for( + tbb::blocked_range(0ul, outer_loop_size), + [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_ = idx]( + const tbb::blocked_range& r + ) + { + auto idx = idx_; + auto fct_stepper = e2.stepper_begin(e1.shape()); + auto res_stepper = e1.stepper_begin(e1.shape()); + std::size_t first_step = true; + // #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, + // res_stepper, idx) + for (std::size_t ox = r.begin(); ox < r.end(); ++ox) + { + if (first_step) + { + is_row_major + ? strided_assign_detail::idx_tools::nth_idx(ox, idx, max_shape) + : strided_assign_detail::idx_tools::nth_idx( + ox, + idx, + max_shape + ); + + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + first_step = false; + } + + for (std::size_t i = 0; i < simd_size; ++i) + { + res_stepper.template store_simd( + fct_stepper.template step_simd() + ); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + // next unaligned index + is_row_major + ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + // need to step E1 as well if not contigous assign (e.g. view) + fct_stepper.to_begin(); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + }, + sp + ); + } + else + { + +#endif + for (std::size_t ox = 0; ox < outer_loop_size; ++ox) { - for (std::size_t i = 0; i < idx.size(); ++i) + for (std::size_t i = 0; i < simd_size; ++i) { - fct_stepper.step(i + step_dim, idx[i]); + res_stepper.store_simd(fct_stepper.template step_simd()); + } + for (std::size_t i = 0; i < simd_rest; ++i) + { + *(res_stepper) = conditional_cast(*(fct_stepper)); + res_stepper.step_leading(); + fct_stepper.step_leading(); + } + + is_row_major + ? 
strided_assign_detail::idx_tools::next_idx(idx, max_shape) + : strided_assign_detail::idx_tools::next_idx(idx, max_shape); + + fct_stepper.to_begin(); + + // need to step E1 as well if not contigous assign (e.g. view) + if (!E1::contiguous_layout) + { + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } - } #if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign) - } + } #endif - } + } template <> template - inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/) + inline void strided_loop_assigner::run( + E1& /*e1*/, + const E2& /*e2*/, + loop_sizes_t const & + ) { } } diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index f5a79d3d7..792af6f82 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -924,9 +924,24 @@ namespace xt inline bool xstrided_container::is_contiguous() const noexcept { using str_type = typename inner_strides_type::value_type; - return is_contiguous_container::value - && (m_strides.empty() || (m_layout == layout_type::row_major && m_strides.back() == str_type(1)) - || (m_layout == layout_type::column_major && m_strides.front() == str_type(1))); + auto is_zero = [](auto i){return i == 0;}; + // This is unsafe if the strides are all 0. Does that happen? + if (!is_contiguous_container::value) + return false; + if (m_layout == layout_type::row_major) + { + auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero); + // If the array has strides of zero, it is a constant, and therefore contiguous. + return it == m_strides.rend() || *it == str_type(1); + } + else if (m_layout == layout_type::column_major) + { + auto it = std::find_if_not(m_strides.begin(), m_strides.end(), is_zero); + // If the array has strides of zero, it is a constant, and therefore contiguous. + return it == m_strides.end() || *it == str_type(1); + } else { + m_strides.empty(); + } } namespace detail diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7f696e6a5..0ce881944 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -173,6 +173,7 @@ set(XTENSOR_TESTS main.cpp test_xaccumulator.cpp test_xadapt.cpp + test_strided_assign.cpp test_xassign.cpp test_xaxis_iterator.cpp test_xaxis_slice_iterator.cpp diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp new file mode 100644 index 000000000..44ab98c07 --- /dev/null +++ b/test/test_strided_assign.cpp @@ -0,0 +1,114 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#include +#include + +#include "xtensor/xadapt.hpp" +#include "xtensor/xarray.hpp" +#include "xtensor/xassign.hpp" +#include "xtensor/xnoalias.hpp" +#include "xtensor/xtensor.hpp" +#include "xtensor/xview.hpp" + +#include "test_common.hpp" +#include "test_common_macros.hpp" + +namespace xt +{ + TEST(xassign_strided, mix_shape_types) + { + auto check_linear_assign = [](auto a, auto b) { + assert_compatible_shape(a, b); + return xassign_traits::linear_assign(a, b, true); + }; + auto check_strided_assign = [](auto a, auto b) { + assert_compatible_shape(a, b); + static_assert(xassign_traits::strided_assign(), "Failed to do strided assign"); + return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign(); + }; + { + auto data = std::vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto simple_xtensor_12 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12}}; + auto simple_xtensor_16 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}}; + auto adapter_strided_noncont = xt::adapt( + data.data(), + 12, + xt::no_ownership(), + std::vector{4, 3}, + std::vector{4, 1} + ); + auto adapter_strided_cont = xt::adapt( + data.data() + 1, + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{4, 1} + ); + auto adapter_strided_noncont_43 = xt::adapt(data.data(), 12, xt::no_ownership(), std::vector{3,4}, + std::vector{5,1}); + EXPECT_FALSE(check_strided_assign(adapter_strided_noncont, xt::transpose(adapter_strided_cont))); + auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0,3), xt::all()); + EXPECT_TRUE(check_linear_assign(contiguous_view, adapter_strided_cont)); + EXPECT_FALSE(check_linear_assign(contiguous_view, xt::transpose(adapter_strided_noncont))); + EXPECT_FALSE(check_linear_assign(xt::transpose(adapter_strided_noncont), contiguous_view)); + EXPECT_FALSE(check_linear_assign(contiguous_view, adapter_strided_noncont_43)); + EXPECT_FALSE(check_linear_assign(adapter_strided_noncont_43, contiguous_view)); + EXPECT_TRUE(check_strided_assign(contiguous_view, adapter_strided_noncont_43)); + EXPECT_TRUE(check_strided_assign(adapter_strided_noncont_43, contiguous_view)); + auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1,3), xt::all()); + auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0,2), xt::all()); + EXPECT_TRUE(check_linear_assign(contiguous_view2, contiguous_view3)); + { + auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0,3)); + EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); + EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); + } + + { + std::vector data2{-1, -1, -1, -1}; + auto linear_adapter = xt::adapt(data2.data(), 4, xt::no_ownership(), std::vector{4, 1}); + auto adapter_cont2 = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{1, 4}); + EXPECT_TRUE(linear_adapter.is_contiguous()); + EXPECT_TRUE(adapter_cont2.is_contiguous()); + bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); + bool success_two = check_linear_assign(adapter_cont2, xt::transpose(linear_adapter)); + EXPECT_TRUE(success_one && success_two); + auto adapter_noncont_singlecol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{4,1},std::vector{4,1}); + EXPECT_FALSE(check_linear_assign(adapter_noncont_singlecol, linear_adapter)); + EXPECT_FALSE(check_linear_assign(linear_adapter, adapter_noncont_singlecol)); + 
EXPECT_FALSE(check_strided_assign(adapter_noncont_singlecol, linear_adapter)); + EXPECT_FALSE(check_strided_assign(linear_adapter, adapter_noncont_singlecol)); + auto adapter_noncont_twocol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2},std::vector{4,1}); + EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); + EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); + EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); + EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); + auto adapter_zero_strides = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2}, std::vector{0,0}); + EXPECT_FALSE(check_linear_assign(adapter_zero_strides, adapter_zero_strides)); + EXPECT_FALSE(check_strided_assign(adapter_zero_strides, adapter_zero_strides)); + EXPECT_FALSE(check_linear_assign(adapter_zero_strides, linear_adapter)); + } + + { + std::vector data2{-1, -1, -1, -1, -1, -1}; + auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); + auto strided_adapter = xt::adapt(data.data(), 6, xt::no_ownership(), std::vector{3, 2}, + std::vector{4, 1}); + EXPECT_FALSE(check_linear_assign(linear_adapter2, strided_adapter)); + EXPECT_FALSE(check_linear_assign(strided_adapter, linear_adapter2)); + EXPECT_TRUE(check_strided_assign(linear_adapter2, strided_adapter)); + EXPECT_TRUE(check_strided_assign(strided_adapter, linear_adapter2)); + xt::noalias(strided_adapter) = linear_adapter2; + auto result_expected = std::vector{-1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + EXPECT_EQ(data, result_expected); + } + } + } +} From 1cc5b59521dd40510429864767b2903eb1800185 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 23 Feb 2023 18:32:28 +0100 Subject: [PATCH 17/33] Revert some unnecessary changes again --- include/xtensor/xstrides.hpp | 2 -- include/xtensor/xview.hpp | 64 ++++++++++++++---------------------- test/test_strided_assign.cpp | 9 +++++ 3 files changed, 33 insertions(+), 42 deletions(-) diff --git a/include/xtensor/xstrides.hpp b/include/xtensor/xstrides.hpp index 99ffa811c..ae261e354 100644 --- a/include/xtensor/xstrides.hpp +++ b/include/xtensor/xstrides.hpp @@ -544,10 +544,8 @@ namespace xt { for (std::size_t i = strides.size(); i != 0; --i) { -// std::cout << "matching: " << strides[i - 1] << " && " << shape[i - 1]<< " && " << data_size << std::endl; if (!stride_match_condition(strides[i - 1], shape[i - 1], data_size, zero_strides)) { -// std::cout << "non-matching: " << strides[i - 1] << " != " << data_size << std::endl; return false; } data_size *= static_cast(shape[i - 1]); diff --git a/include/xtensor/xview.hpp b/include/xtensor/xview.hpp index 52b7c33c0..0a2dbb6be 100644 --- a/include/xtensor/xview.hpp +++ b/include/xtensor/xview.hpp @@ -184,23 +184,15 @@ namespace xt ? is_all_slice : (!range_seen && (is_int_slice || is_range_slice))); - static constexpr bool value = is_contiguous_view_impl>::value; - static constexpr size_t contiguous_dimensions = is_contiguous_view_impl>::contiguous_dimensions + (value ? 
1 :0); + static constexpr bool value = is_contiguous_view_impl < layout_type::row_major, is_valid, + have_all_seen, range_seen || is_range_slice, + xtl::mpl::pop_front_t < V >> ::value; }; template struct is_contiguous_view_impl> { static constexpr bool value = valid; - static constexpr size_t contiguous_dimensions = 0; }; // For column major the *same* but reverse is true -- with the additional @@ -984,41 +976,33 @@ namespace xt return xtl::mpl::static_if( [&](auto self) { - return self(this)->m_e.layout()==layout_type::row_major ? - ((self(this)->strides()[self(this)->strides().size()-1] == 1) ? self(this)->m_e.layout() : layout_type::dynamic) : - ((self(this)->strides()[0] == 1) ? self(this)->m_e.layout() : layout_type::dynamic); - - bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); - return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; + if (static_layout != layout_type::dynamic) + { + return static_layout; + } + else + { + bool strides_match = do_strides_match( + self(this)->shape(), + self(this)->strides(), + self(this)->m_e.layout(), + true + ); + return strides_match ? self(this)->m_e.layout() : layout_type::dynamic; + } }, - /* else */ [&](auto /*self*/) - { - std::cout << "View is not strided!" << std::endl; - return layout_type::dynamic; - }); + /* else */ + [&](auto /*self*/) + { + return layout_type::dynamic; + } + ); } template inline bool xview::is_contiguous() const noexcept { - return xtl::mpl::static_if([&](auto self) - { - if (static_layout != layout_type::dynamic) - { - return true; - } - else - { - bool strides_match = do_strides_match(self(this)->shape(), self(this)->strides(), self(this)->m_e.layout(), true); - return strides_match ? self(this)->m_e.layout()!=layout_type::dynamic : false; - } - }, - /* else */ [&](auto /*self*/) - { - return false; - }); - -// return layout() != layout_type::dynamic; + return layout() != layout_type::dynamic; } //@} diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 44ab98c07..c89e5cecd 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -70,6 +70,15 @@ namespace xt EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); } + { + auto view_cont = xt::view(simple_xtensor_16, xt::range(0,1), xt::range(0,4)); + auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3,4), xt::range(0,4)); + EXPECT_TRUE(check_linear_assign(view_cont, view_cont2)); + auto view_noncont = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(0,3)); + auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(1,4)); + EXPECT_FALSE(check_linear_assign(view_noncont, view_noncont2)); + EXPECT_TRUE(check_strided_assign(view_noncont, view_noncont2)); + } { std::vector data2{-1, -1, -1, -1}; From 011d8433aa90244353c433f37181ef953b273512 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Sun, 26 Feb 2023 17:34:23 +0100 Subject: [PATCH 18/33] Add tbb to benchmark --- benchmark/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 51228749f..6b98e63f8 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -136,6 +136,15 @@ set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor) add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS}) target_link_libraries(${XTENSOR_BENCHMARK_TARGET} xtensor 
${GBENCHMARK_LIBRARIES}) +if(XTENSOR_USE_TBB) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_TBB) + target_include_directories(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_INCLUDE_DIRS}) + target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_LIBRARIES}) +endif() +if(XTENSOR_USE_OPENMP) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_OPENMP) +endif() + add_custom_target(xbenchmark COMMAND benchmark_xtensor DEPENDS ${XTENSOR_BENCHMARK_TARGET}) From 7c179a5d2931b8b31548eb86784bdc2e65b6ba16 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sat, 25 Feb 2023 23:39:59 +0100 Subject: [PATCH 19/33] Fix typo --- include/xtensor/xcontainer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 792af6f82..41c40be1c 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -940,7 +940,7 @@ namespace xt // If the array has strides of zero, it is a constant, and therefore contiguous. return it == m_strides.end() || *it == str_type(1); } else { - m_strides.empty(); + return m_strides.empty(); } } From 4c49f91a1c6e43d0e1020fa7cceea255ffdfee2e Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:12:40 +0100 Subject: [PATCH 20/33] Fix a few things and refactor to make things backward-compatible --- include/xtensor/xassign.hpp | 80 ++++++++++++++++++--------- test/test_strided_assign.cpp | 103 ++++++++++++++++++++++++----------- test/test_xassign.cpp | 10 ++++ 3 files changed, 135 insertions(+), 58 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 4a68827b6..dc3302495 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -172,29 +172,32 @@ namespace xt * strided_loop_assigner * *************************/ - namespace strided_assign_detail { - struct loop_sizes_t { + namespace strided_assign_detail + { + struct loop_sizes_t + { + bool can_do_strided_assign; bool is_row_major; std::size_t inner_loop_size; std::size_t outer_loop_size; std::size_t cut; std::size_t dimension; - inline bool can_do_strided_assign() { // Is it actually useful to do a strided assign?
- return inner_loop_size > 1; - //return !((is_row_major && cut == dimension) || (!is_row_major && cut == 0)) && inner_loop_size != 1; - }; }; } + template class strided_loop_assigner { public: + using loop_sizes_t = strided_assign_detail::loop_sizes_t; // is_row_major, inner_loop_size, outer_loop_size, cut template - static void run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes); + static void run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes); template static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2); + template + static void run(E1& e1, const E2& e2); }; /*********************************** @@ -471,11 +474,7 @@ namespace xt { if (simd_strided_assign) { - strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(de1, de2); - if (loop_sizes.can_do_strided_assign()) - strided_loop_assigner::run(de1, de2, loop_sizes); - else - stepper_assigner(de1, de2).run(); + strided_loop_assigner::run(de1, de2); } else { @@ -1024,7 +1023,13 @@ namespace xt const strides_type& m_strides; }; - template + template ::value || !has_strides::value || !possible, bool> = true> + loop_sizes_t get_loop_sizes(const E1& e1, const E2&) + { + return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; + } + + template ::value && has_strides::value && possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { using shape_value_type = typename E1::shape_type::value_type; @@ -1033,8 +1038,11 @@ namespace xt // Try to find a row-major scheme first, where the outer loop is on the first N = `cut` // dimensions, and the inner loop is is_row_major = true; - auto is_zero = [](auto i){return i == 0;}; - auto &&strides = e1.strides(); + auto is_zero = [](auto i) + { + return i == 0; + }; + auto&& strides = e1.strides(); auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero); bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1; auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero); @@ -1050,7 +1058,7 @@ namespace xt else { // No strided loop possible. 
- return {true, 1, e1.size(), e1.dimension(), e1.dimension()}; + return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; } // Cut is the number of dimensions in the outer loop @@ -1069,7 +1077,8 @@ namespace xt } else if (!is_row_major) { - auto csf = check_strides_functor(e1.strides()); + auto csf = check_strides_functor(e1.strides() + ); cut = csf(e2); if (cut > 1) { @@ -1096,7 +1105,7 @@ namespace xt std::swap(outer_loop_size, inner_loop_size); } - return {is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; + return {inner_loop_size > 1, is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()}; } } @@ -1104,14 +1113,14 @@ namespace xt template inline strided_assign_detail::loop_sizes_t strided_loop_assigner::get_loop_sizes(E1& e1, const E2& e2) { - return strided_assign_detail::get_loop_sizes(e1, e2); + return strided_assign_detail::get_loop_sizes(e1, e2); } #define strided_parallel_assign template template - inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t &loop_sizes) + inline void strided_loop_assigner::run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes) { bool is_row_major = loop_sizes.is_row_major; std::size_t inner_loop_size = loop_sizes.inner_loop_size; @@ -1244,7 +1253,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd( + res_stepper.template store_simd( fct_stepper.template step_simd() ); } @@ -1318,15 +1327,34 @@ namespace xt } #endif } + template <> + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2) + { + strided_assign_detail::loop_sizes_t + loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); + if (loop_sizes.can_do_strided_assign) + { + run(e1, e2, loop_sizes); + } + else + { + // trigger the fallback assigner + stepper_assigner(e1, e2).run(); + } + } template <> template - inline void strided_loop_assigner::run( - E1& /*e1*/, - const E2& /*e2*/, - loop_sizes_t const & - ) + inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&) + { + } + template <> + template + inline void strided_loop_assigner::run(E1& e1, const E2& e2) { + // trigger the fallback assigner + stepper_assigner(e1, e2).run(); } } diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index c89e5cecd..4c91534fb 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -24,19 +24,29 @@ namespace xt { TEST(xassign_strided, mix_shape_types) { - auto check_linear_assign = [](auto a, auto b) { + auto check_linear_assign = [](auto a, auto b) + { assert_compatible_shape(a, b); return xassign_traits::linear_assign(a, b, true); }; - auto check_strided_assign = [](auto a, auto b) { + auto check_strided_assign = [](auto a, auto b) + { assert_compatible_shape(a, b); - static_assert(xassign_traits::strided_assign(), "Failed to do strided assign"); - return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign(); + static_assert( + xassign_traits::strided_assign(), + "Failed to do strided assign" + ); + return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; { - auto data = std::vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - auto simple_xtensor_12 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12}}; - auto simple_xtensor_16 = xt::xtensor{{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}}; + auto data = std::vector{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto simple_xtensor_12 = xt::xtensor{{1, 2, 3, 
4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; + auto simple_xtensor_16 = xt::xtensor{ + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}, + {13, 14, 15, 16}}; auto adapter_strided_noncont = xt::adapt( data.data(), 12, @@ -45,16 +55,21 @@ namespace xt std::vector{4, 1} ); auto adapter_strided_cont = xt::adapt( - data.data() + 1, - 12, - xt::no_ownership(), - std::vector{3, 4}, - std::vector{4, 1} + data.data() + 1, + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{4, 1} + ); + auto adapter_strided_noncont_43 = xt::adapt( + data.data(), + 12, + xt::no_ownership(), + std::vector{3, 4}, + std::vector{5, 1} ); - auto adapter_strided_noncont_43 = xt::adapt(data.data(), 12, xt::no_ownership(), std::vector{3,4}, - std::vector{5,1}); EXPECT_FALSE(check_strided_assign(adapter_strided_noncont, xt::transpose(adapter_strided_cont))); - auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0,3), xt::all()); + auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0, 3), xt::all()); EXPECT_TRUE(check_linear_assign(contiguous_view, adapter_strided_cont)); EXPECT_FALSE(check_linear_assign(contiguous_view, xt::transpose(adapter_strided_noncont))); EXPECT_FALSE(check_linear_assign(xt::transpose(adapter_strided_noncont), contiguous_view)); @@ -62,20 +77,20 @@ namespace xt EXPECT_FALSE(check_linear_assign(adapter_strided_noncont_43, contiguous_view)); EXPECT_TRUE(check_strided_assign(contiguous_view, adapter_strided_noncont_43)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont_43, contiguous_view)); - auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1,3), xt::all()); - auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0,2), xt::all()); + auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1, 3), xt::all()); + auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0, 2), xt::all()); EXPECT_TRUE(check_linear_assign(contiguous_view2, contiguous_view3)); { - auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0,3)); + auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0, 3)); EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont)); EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont)); } { - auto view_cont = xt::view(simple_xtensor_16, xt::range(0,1), xt::range(0,4)); - auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3,4), xt::range(0,4)); + auto view_cont = xt::view(simple_xtensor_16, xt::range(0, 1), xt::range(0, 4)); + auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3, 4), xt::range(0, 4)); EXPECT_TRUE(check_linear_assign(view_cont, view_cont2)); - auto view_noncont = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(0,3)); - auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0,4), xt::range(1,4)); + auto view_noncont = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(0, 3)); + auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(1, 4)); EXPECT_FALSE(check_linear_assign(view_noncont, view_noncont2)); EXPECT_TRUE(check_strided_assign(view_noncont, view_noncont2)); } @@ -89,17 +104,35 @@ namespace xt bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); bool success_two = check_linear_assign(adapter_cont2, xt::transpose(linear_adapter)); EXPECT_TRUE(success_one && success_two); - auto adapter_noncont_singlecol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{4,1},std::vector{4,1}); + auto adapter_noncont_singlecol = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + 
std::vector{4, 1}, + std::vector{4, 1} + ); EXPECT_FALSE(check_linear_assign(adapter_noncont_singlecol, linear_adapter)); EXPECT_FALSE(check_linear_assign(linear_adapter, adapter_noncont_singlecol)); EXPECT_FALSE(check_strided_assign(adapter_noncont_singlecol, linear_adapter)); EXPECT_FALSE(check_strided_assign(linear_adapter, adapter_noncont_singlecol)); - auto adapter_noncont_twocol = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2},std::vector{4,1}); - EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); - EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); - EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2,2}))); - EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2,2}), adapter_noncont_twocol)); - auto adapter_zero_strides = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{2,2}, std::vector{0,0}); + auto adapter_noncont_twocol = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{2, 2}, + std::vector{4, 1} + ); + EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2}))); + EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol)); + EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2}))); + EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol)); + auto adapter_zero_strides = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{2, 2}, + std::vector{0, 0} + ); EXPECT_FALSE(check_linear_assign(adapter_zero_strides, adapter_zero_strides)); EXPECT_FALSE(check_strided_assign(adapter_zero_strides, adapter_zero_strides)); EXPECT_FALSE(check_linear_assign(adapter_zero_strides, linear_adapter)); @@ -108,14 +141,20 @@ namespace xt { std::vector data2{-1, -1, -1, -1, -1, -1}; auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); - auto strided_adapter = xt::adapt(data.data(), 6, xt::no_ownership(), std::vector{3, 2}, - std::vector{4, 1}); + auto strided_adapter = xt::adapt( + data.data(), + 6, + xt::no_ownership(), + std::vector{3, 2}, + std::vector{4, 1} + ); EXPECT_FALSE(check_linear_assign(linear_adapter2, strided_adapter)); EXPECT_FALSE(check_linear_assign(strided_adapter, linear_adapter2)); EXPECT_TRUE(check_strided_assign(linear_adapter2, strided_adapter)); EXPECT_TRUE(check_strided_assign(strided_adapter, linear_adapter2)); xt::noalias(strided_adapter) = linear_adapter2; - auto result_expected = std::vector{-1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + auto result_expected = std::vector{ + -1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; EXPECT_EQ(data, result_expected); } } diff --git a/test/test_xassign.cpp b/test/test_xassign.cpp index 475d0e5cf..4af38bcf1 100644 --- a/test/test_xassign.cpp +++ b/test/test_xassign.cpp @@ -9,6 +9,7 @@ #include #include +#include #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" @@ -82,6 +83,15 @@ class my_vector return m_data.begin(); } + auto rbegin() const + { + return std::make_reverse_iterator(end()); + } + auto rend() const + { + return std::make_reverse_iterator(begin()); + } + auto empty() const { return m_data.empty(); From 0b3caffb8b3a8363cc322e21d009b3a3540d0578 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:49:49 +0100 Subject: [PATCH 21/33] Use tbb and opnemp also in benchmark --- benchmark/CMakeLists.txt | 10 
+++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 6b98e63f8..412d03644 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -134,15 +134,15 @@ set(XTENSOR_BENCHMARK set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor) add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS}) -target_link_libraries(${XTENSOR_BENCHMARK_TARGET} xtensor ${GBENCHMARK_LIBRARIES}) +target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC xtensor ${GBENCHMARK_LIBRARIES}) if(XTENSOR_USE_TBB) - target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_TBB) - target_include_directories(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_INCLUDE_DIRS}) - target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PRIVATE ${TBB_LIBRARIES}) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_TBB) + target_include_directories(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_INCLUDE_DIRS}) + target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_LIBRARIES}) endif() if(XTENSOR_USE_OPENMP) - target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PRIVATE XTENSOR_USE_OPENMP) + target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_OPENMP) endif() add_custom_target(xbenchmark From 609d8489b0f98058f5ae91da894de33ff127e37b Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 17:50:03 +0100 Subject: [PATCH 22/33] Use the various THRESHOLDs for the strided loops --- include/xtensor/xassign.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index dc3302495..68835f047 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1167,7 +1167,7 @@ namespace xt step_dim = cut; } #if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - if (outer_loop_size > 20) + if (outer_loop_size >= XTENSOR_OPENMP_THRESHOLD / inner_loop_size) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) @@ -1216,7 +1216,7 @@ namespace xt else { #elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB) - if (outer_loop_size > 20) + if (outer_loop_size > XTENSOR_TBB_THRESHOLD / inner_loop_size) { tbb::static_partitioner sp; tbb::parallel_for( From 6a9d20513cada231e941c867766cc3d0955ccf11 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:32:11 +0100 Subject: [PATCH 23/33] Add benchmark for stencil operations --- benchmark/benchmark_views.cpp | 48 +++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/benchmark/benchmark_views.cpp b/benchmark/benchmark_views.cpp index e0f9067d3..13d21366a 100644 --- a/benchmark/benchmark_views.cpp +++ b/benchmark/benchmark_views.cpp @@ -195,6 +195,54 @@ namespace xt BENCHMARK_TEMPLATE(view_assign_strided_view_noalias, float); } + namespace finite_diff + { + inline auto stencil_threedirections(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, core, xt::range(2, size)) + + xt::view(a, core, core, xt::range(0, size - 2)) + + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), 
core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + + inline auto stencil_onedirection(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 2.0 + * (xt::view(a, xt::range(2, size), core, core) + - xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_50, 50); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_100, 100); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_200, 200); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_300, 300); + BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_500, 500); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_50, 50); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_100, 100); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_200, 200); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_300, 300); + BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_500, 500); + } + namespace stridedview { From 4299283443c3f28ec17e4eec706063bfea8ae997 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:36:29 +0100 Subject: [PATCH 24/33] Add two-direction stencil too --- benchmark/benchmark_views.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/benchmark/benchmark_views.cpp b/benchmark/benchmark_views.cpp index 13d21366a..db233ec05 100644 --- a/benchmark/benchmark_views.cpp +++ b/benchmark/benchmark_views.cpp @@ -216,6 +216,23 @@ namespace xt } } + inline auto stencil_twodirections(benchmark::State& state, size_t size) + { + for (auto _ : state) + { + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + xt::noalias(xt::view(b, core, core, core) + ) = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + benchmark::DoNotOptimize(b); + } + } + inline auto stencil_onedirection(benchmark::State& state, size_t size) { for (auto _ : state) @@ -236,6 +253,11 @@ namespace xt BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_200, 200); BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_300, 300); BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_500, 500); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_50, 50); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_100, 100); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_200, 200); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_300, 300); + BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_500, 500); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_50, 50); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_100, 100); BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_200, 200); From f66ab3f745d6d57a908344e99f6475e43ce0aa3c Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 18:54:08 +0100 Subject: [PATCH 25/33] Add more elaborate view test --- 
test/test_strided_assign.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 4c91534fb..b8cea8571 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -38,6 +38,20 @@ namespace xt ); return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; + { + size_t size = 50; + const std::array shape = {size, size, size}; + xt::xtensor a(shape), b(shape); + auto core = xt::range(1, size - 1); + auto lhs = xt::view(b, core, core, core); + auto rhs = 1.0 / 7.0 + * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core) + + xt::view(a, core, xt::range(0, size - 2), core) + + xt::view(a, xt::range(2, size), core, core) + + xt::view(a, xt::range(0, size - 2), core, core)); + xt::noalias(lhs) = rhs; + EXPECT_TRUE(check_strided_assign(lhs, rhs)); + } { auto data = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; From 32595af23114618b17491dad64fa081c751917b7 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Sun, 26 Feb 2023 23:57:48 +0100 Subject: [PATCH 26/33] Remove some buggy checks --- include/xtensor/xassign.hpp | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 68835f047..48385cce6 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -903,16 +903,8 @@ namespace xt for (size_type i = 0; i < outer_shape.size(); i++) { auto d_idx = n / stride_sizes[i]; - if (d_idx < outer_shape[i]) - { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - else - { - // that may not happen - outer_index[i] = 0; - } + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; } } }; @@ -959,12 +951,8 @@ namespace xt { i--; auto d_idx = n / stride_sizes[i]; - if (d_idx < (outer_shape[i] - 1)) - { - outer_index[i] = d_idx; - n -= d_idx * stride_sizes[i]; - } - // else {} // that may not happen + outer_index[i] = d_idx; + n -= d_idx * stride_sizes[i]; } } }; @@ -1023,13 +1011,13 @@ namespace xt const strides_type& m_strides; }; - template ::value || !has_strides::value || !possible, bool> = true> + template ::value || !possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2&) { return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()}; } - template ::value && has_strides::value && possible, bool> = true> + template ::value && possible, bool> = true> loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2) { using shape_value_type = typename E1::shape_type::value_type; From df7abfb9010cc38eed6458526f6f526f4d75cc75 Mon Sep 17 00:00:00 2001 From: Ewoud Wempe Date: Mon, 27 Feb 2023 00:06:35 +0100 Subject: [PATCH 27/33] Fix a few more bugs --- include/xtensor/xassign.hpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 48385cce6..2566bde17 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1155,7 +1155,7 @@ namespace xt step_dim = cut; } #if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign) - if (outer_loop_size >= XTENSOR_OPENMP_THRESHOLD / inner_loop_size) + if (outer_loop_size >= XTENSOR_OPENMP_TRESHOLD / inner_loop_size) { std::size_t first_step = true; #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx) @@ -1241,9 +1241,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - 
res_stepper.template store_simd( - fct_stepper.template step_simd() - ); + res_stepper.template store_simd(fct_stepper.template step_simd()); } for (std::size_t i = 0; i < simd_rest; ++i) { @@ -1257,13 +1255,24 @@ namespace xt ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) + + // need to step E1 as well if not contigous assign (e.g. view) + if (!E1::contiguous_layout) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } }, @@ -1315,12 +1324,12 @@ namespace xt } #endif } + template <> template inline void strided_loop_assigner::run(E1& e1, const E2& e2) { - strided_assign_detail::loop_sizes_t - loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); + strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner::get_loop_sizes(e1, e2); if (loop_sizes.can_do_strided_assign) { run(e1, e2, loop_sizes); From b2cea0084c907038e74ad1fe6c7219db9feec9cc Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:20:11 +0100 Subject: [PATCH 28/33] run clang-format on xcontainer --- include/xtensor/xcontainer.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 41c40be1c..26045f6b6 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -924,10 +924,15 @@ namespace xt inline bool xstrided_container::is_contiguous() const noexcept { using str_type = typename inner_strides_type::value_type; - auto is_zero = [](auto i){return i == 0;}; + auto is_zero = [](auto i) + { + return i == 0; + }; // This is unsafe if the strides are all 0. Does that happen? if (!is_contiguous_container::value) + { return false; + } if (m_layout == layout_type::row_major) { auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero); @@ -939,7 +944,9 @@ namespace xt auto it = std::find_if_not(m_strides.begin(), m_strides.end(), is_zero); // If the array has strides of zero, it is a constant, and therefore contiguous. return it == m_strides.end() || *it == str_type(1); - } else { + } + else + { return m_strides.empty(); } } From d84724e02754f4f1fa99026858c8261b5413b816 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:20:41 +0100 Subject: [PATCH 29/33] Run clang-format on xassign & test_xassign --- include/xtensor/xassign.hpp | 1 + test/test_xassign.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 2566bde17..74e307956 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1346,6 +1346,7 @@ namespace xt inline void strided_loop_assigner::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&) { } + template <> template inline void strided_loop_assigner::run(E1& e1, const E2& e2) diff --git a/test/test_xassign.cpp b/test/test_xassign.cpp index 4af38bcf1..5367a32da 100644 --- a/test/test_xassign.cpp +++ b/test/test_xassign.cpp @@ -7,9 +7,9 @@ * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ +#include #include #include -#include #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" @@ -87,6 +87,7 @@ class my_vector { return std::make_reverse_iterator(end()); } + auto rend() const { return std::make_reverse_iterator(begin()); From 68a5a9efbae2c150c65f11437578a7002a523186 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:42:16 +0100 Subject: [PATCH 30/33] Fix for column layout default --- test/test_strided_assign.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index b8cea8571..800764258 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -13,6 +13,7 @@ #include "xtensor/xadapt.hpp" #include "xtensor/xarray.hpp" #include "xtensor/xassign.hpp" +#include "xtensor/xlayout.hpp" #include "xtensor/xnoalias.hpp" #include "xtensor/xtensor.hpp" #include "xtensor/xview.hpp" @@ -55,8 +56,11 @@ namespace xt { auto data = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - auto simple_xtensor_12 = xt::xtensor{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; - auto simple_xtensor_16 = xt::xtensor{ + auto simple_xtensor_12 = xt::xtensor{ + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}}; + auto simple_xtensor_16 = xt::xtensor{ {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, @@ -111,8 +115,18 @@ namespace xt { std::vector data2{-1, -1, -1, -1}; - auto linear_adapter = xt::adapt(data2.data(), 4, xt::no_ownership(), std::vector{4, 1}); - auto adapter_cont2 = xt::adapt(data.data(), 4, xt::no_ownership(), std::vector{1, 4}); + auto linear_adapter = xt::adapt( + data2.data(), + 4, + xt::no_ownership(), + std::vector{4, 1} + ); + auto adapter_cont2 = xt::adapt( + data.data(), + 4, + xt::no_ownership(), + std::vector{1, 4} + ); EXPECT_TRUE(linear_adapter.is_contiguous()); EXPECT_TRUE(adapter_cont2.is_contiguous()); bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2)); @@ -154,7 +168,12 @@ namespace xt { std::vector data2{-1, -1, -1, -1, -1, -1}; - auto linear_adapter2 = xt::adapt(data2.data(), 6, xt::no_ownership(), std::vector{3, 2}); + auto linear_adapter2 = xt::adapt( + data2.data(), + 6, + xt::no_ownership(), + std::vector{3, 2} + ); auto strided_adapter = xt::adapt( data.data(), 6, From 36483d68b4cfa7c0c85383bd4d7a9664eb5b235c Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 14:55:22 +0100 Subject: [PATCH 31/33] Fix openmp version --- include/xtensor/xassign.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 74e307956..0eb601400 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -1177,7 +1177,7 @@ namespace xt for (std::size_t i = 0; i < simd_size; ++i) { - res_stepper.template store_simd(fct_stepper.template step_simd()); + res_stepper.template store_simd(fct_stepper.template step_simd()); } for (std::size_t i = 0; i < simd_rest; ++i) { @@ -1191,13 +1191,24 @@ namespace xt ? strided_assign_detail::idx_tools::next_idx(idx, max_shape) : strided_assign_detail::idx_tools::next_idx(idx, max_shape); - // need to step E1 as well if not contigous assign (e.g. view) fct_stepper.to_begin(); - res_stepper.to_begin(); - for (std::size_t i = 0; i < idx.size(); ++i) + + // need to step E1 as well if not contigous assign (e.g. 
view) + if (!E1::contiguous_layout) { - fct_stepper.step(i + step_dim, idx[i]); - res_stepper.step(i + step_dim, idx[i]); + res_stepper.to_begin(); + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + res_stepper.step(i + step_dim, idx[i]); + } + } + else + { + for (std::size_t i = 0; i < idx.size(); ++i) + { + fct_stepper.step(i + step_dim, idx[i]); + } } } } From 2f678c1544378004eb37bc27419c8de2ebd1f645 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Mon, 27 Feb 2023 15:37:03 +0100 Subject: [PATCH 32/33] Try to fix the VS2015 test on appveyor --- test/test_strided_assign.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp index 800764258..e10ba74c2 100644 --- a/test/test_strided_assign.cpp +++ b/test/test_strided_assign.cpp @@ -33,10 +33,12 @@ namespace xt auto check_strided_assign = [](auto a, auto b) { assert_compatible_shape(a, b); +#ifndef _WIN32 static_assert( xassign_traits::strided_assign(), "Failed to do strided assign" ); +#endif return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign; }; { From 3bf9d1b688c8665cf543b8f48bb8a6b828df12d9 Mon Sep 17 00:00:00 2001 From: Ewoud Date: Thu, 16 Mar 2023 16:24:14 +0100 Subject: [PATCH 33/33] Remove no longer relevant comment and refactor a few lines --- include/xtensor/xassign.hpp | 13 +++++-------- include/xtensor/xcontainer.hpp | 3 ++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 0eb601400..9461320a8 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -470,16 +470,13 @@ namespace xt linear_assigner::run(de1, de2); } } + else if (simd_strided_assign) + { + strided_loop_assigner::run(de1, de2); + } else { - if (simd_strided_assign) - { - strided_loop_assigner::run(de1, de2); - } - else - { - stepper_assigner(de1, de2).run(); - } + stepper_assigner(de1, de2).run(); } } diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 26045f6b6..d4621672d 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -928,11 +928,12 @@ namespace xt { return i == 0; }; - // This is unsafe if the strides are all 0. Does that happen? if (!is_contiguous_container::value) { return false; } + // We need to make sure the inner-most non-zero stride is one. + // Trailing zero strides are ignored because they indicate bradcasted dimensions. if (m_layout == layout_type::row_major) { auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero);
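
Illustrative usage sketch (not part of the patch series): the snippet below exercises the strided assignment path these commits add, using only calls that already appear in test_strided_assign.cpp (xt::adapt with an explicit stride vector and xt::noalias assignment). It assumes an xtensor build with these patches applied; the variable names (strided_dst, linear_src) are made up for the example.

#include <cassert>
#include <cstddef>
#include <vector>

#include "xtensor/xadapt.hpp"
#include "xtensor/xnoalias.hpp"

int main()
{
    std::vector<double> src(6);
    std::vector<double> dst(20, -1.0);
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        src[i] = static_cast<double>(i);  // 0, 1, 2, 3, 4, 5
    }

    // A 3x2 destination window with row stride 4: each row is contiguous but there
    // are gaps between rows, so the one-dimensional linear assign is rejected and
    // the strided loop (inner loop of 2 elements, outer loop of 3 rows) is used.
    auto strided_dst = xt::adapt(
        dst.data(), 6, xt::no_ownership(),
        std::vector<std::size_t>{3, 2}, std::vector<std::size_t>{4, 1});
    auto linear_src = xt::adapt(
        src.data(), 6, xt::no_ownership(), std::vector<std::size_t>{3, 2});

    xt::noalias(strided_dst) = linear_src;

    // Only the strided window is written; the gap elements keep their -1 fill.
    assert(dst[0] == 0.0 && dst[1] == 1.0 && dst[2] == -1.0);
    assert(dst[4] == 2.0 && dst[5] == 3.0 && dst[8] == 4.0 && dst[9] == 5.0);
    return 0;
}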