diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 51228749f..412d03644 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -134,7 +134,16 @@ set(XTENSOR_BENCHMARK
 
 set(XTENSOR_BENCHMARK_TARGET benchmark_xtensor)
 add_executable(${XTENSOR_BENCHMARK_TARGET} EXCLUDE_FROM_ALL ${XTENSOR_BENCHMARK} ${XTENSOR_HEADERS})
-target_link_libraries(${XTENSOR_BENCHMARK_TARGET} xtensor ${GBENCHMARK_LIBRARIES})
+target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC xtensor ${GBENCHMARK_LIBRARIES})
+
+if(XTENSOR_USE_TBB)
+    target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_TBB)
+    target_include_directories(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_INCLUDE_DIRS})
+    target_link_libraries(${XTENSOR_BENCHMARK_TARGET} PUBLIC ${TBB_LIBRARIES})
+endif()
+if(XTENSOR_USE_OPENMP)
+    target_compile_definitions(${XTENSOR_BENCHMARK_TARGET} PUBLIC XTENSOR_USE_OPENMP)
+endif()
 
 add_custom_target(xbenchmark
     COMMAND benchmark_xtensor
diff --git a/benchmark/benchmark_views.cpp b/benchmark/benchmark_views.cpp
index e0f9067d3..db233ec05 100644
--- a/benchmark/benchmark_views.cpp
+++ b/benchmark/benchmark_views.cpp
@@ -195,6 +195,76 @@ namespace xt
         BENCHMARK_TEMPLATE(view_assign_strided_view_noalias, float);
     }
 
+    namespace finite_diff
+    {
+        inline auto stencil_threedirections(benchmark::State& state, size_t size)
+        {
+            for (auto _ : state)
+            {
+                const std::array<size_t, 3> shape = {size, size, size};
+                xt::xtensor<double, 3> a(shape), b(shape);
+                auto core = xt::range(1, size - 1);
+                xt::noalias(xt::view(b, core, core, core)
+                ) = 1.0 / 7.0
+                    * (xt::view(a, core, core, core) + xt::view(a, core, core, xt::range(2, size))
+                       + xt::view(a, core, core, xt::range(0, size - 2))
+                       + xt::view(a, core, xt::range(2, size), core)
+                       + xt::view(a, core, xt::range(0, size - 2), core)
+                       + xt::view(a, xt::range(2, size), core, core)
+                       + xt::view(a, xt::range(0, size - 2), core, core));
+                benchmark::DoNotOptimize(b);
+            }
+        }
+
+        inline auto stencil_twodirections(benchmark::State& state, size_t size)
+        {
+            for (auto _ : state)
+            {
+                const std::array<size_t, 3> shape = {size, size, size};
+                xt::xtensor<double, 3> a(shape), b(shape);
+                auto core = xt::range(1, size - 1);
+                xt::noalias(xt::view(b, core, core, core)
+                ) = 1.0 / 7.0
+                    * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core)
+                       + xt::view(a, core, xt::range(0, size - 2), core)
+                       + xt::view(a, xt::range(2, size), core, core)
+                       + xt::view(a, xt::range(0, size - 2), core, core));
+                benchmark::DoNotOptimize(b);
+            }
+        }
+
+        inline auto stencil_onedirection(benchmark::State& state, size_t size)
+        {
+            for (auto _ : state)
+            {
+                const std::array<size_t, 3> shape = {size, size, size};
+                xt::xtensor<double, 3> a(shape), b(shape);
+                auto core = xt::range(1, size - 1);
+                xt::noalias(xt::view(b, core, core, core)
+                ) = 1.0 / 2.0
+                    * (xt::view(a, xt::range(2, size), core, core)
+                       - xt::view(a, xt::range(0, size - 2), core, core));
+                benchmark::DoNotOptimize(b);
+            }
+        }
+
+        BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_50, 50);
+        BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_100, 100);
+        BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_200, 200);
+        BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_300, 300);
+        BENCHMARK_CAPTURE(stencil_threedirections, stencil_threedirections_500, 500);
+        BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_50, 50);
+        BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_100, 100);
+        BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_200, 200);
+        BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_300, 300);
+        BENCHMARK_CAPTURE(stencil_twodirections, stencil_twodirections_500, 500);
+        BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_50, 50);
+        BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_100, 100);
+        BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_200, 200);
+        BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_300, 300);
+        BENCHMARK_CAPTURE(stencil_onedirection, stencil_onedirections_500, 500);
+    }
+
     namespace stridedview
     {
 
diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp
index 1c5535113..9461320a8 100644
--- a/include/xtensor/xassign.hpp
+++ b/include/xtensor/xassign.hpp
@@ -172,11 +172,30 @@ namespace xt
      * strided_loop_assigner *
      *************************/
 
+    namespace strided_assign_detail
+    {
+        struct loop_sizes_t
+        {
+            bool can_do_strided_assign;
+            bool is_row_major;
+            std::size_t inner_loop_size;
+            std::size_t outer_loop_size;
+            std::size_t cut;
+            std::size_t dimension;
+        };
+    }
+
     template <bool simd>
     class strided_loop_assigner
     {
     public:
 
+        using loop_sizes_t = strided_assign_detail::loop_sizes_t;
+        // is_row_major, inner_loop_size, outer_loop_size, cut
+        template <class E1, class E2>
+        static void run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes);
+        template <class E1, class E2>
+        static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2);
         template <class E1, class E2>
         static void run(E1& e1, const E2& e2);
     };
@@ -254,7 +273,7 @@ namespace xt
             -> std::enable_if_t<has_strides<E1>::value, bool>
         {
             return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout<E1, E2>())
-                   || (e1.layout() != layout_type::dynamic && e2.has_linear_assign(e1.strides()));
+                   || (e1.is_contiguous() && e2.has_linear_assign(e1.strides()));
         }
 
         template <class E1, class E2>
@@ -457,8 +476,7 @@ namespace xt
         }
         else
         {
-            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)> assigner(de1, de2);
-            assigner.run();
+            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(de1, de2).run();
         }
     }
 
@@ -713,6 +731,7 @@ namespace xt
 #if defined(XTENSOR_USE_TBB)
         if (size >= XTENSOR_TBB_THRESHOLD)
         {
+            tbb::static_partitioner ap;
             tbb::parallel_for(
                 align_begin,
                 align_end,
@@ -723,7 +742,8 @@ namespace xt
                         i,
                         e2.template load_simd<rhs_align_mode, value_type>(i)
                     );
-                }
+                },
+                ap
             );
         }
         else
@@ -734,7 +754,7 @@ namespace xt
             }
         }
 #elif defined(XTENSOR_USE_OPENMP)
-        if (size >= XTENSOR_OPENMP_TRESHOLD)
+        if (size >= size_type(XTENSOR_OPENMP_TRESHOLD))
         {
 #pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2)
 #ifndef _WIN32
@@ -789,21 +809,34 @@ namespace xt
         auto src = linear_begin(e2);
         auto dst = linear_begin(e1);
         size_type n = e1.size();
-
 #if defined(XTENSOR_USE_TBB)
+        tbb::static_partitioner sp;
         tbb::parallel_for(
             std::ptrdiff_t(0),
             static_cast<std::ptrdiff_t>(n),
             [&](std::ptrdiff_t i)
             {
                 *(dst + i) = static_cast<value_type>(*(src + i));
-            }
+            },
+            sp
         );
 #elif defined(XTENSOR_USE_OPENMP)
+        if (n >= XTENSOR_OPENMP_TRESHOLD)
+        {
 #pragma omp parallel for default(none) shared(src, dst, n)
-        for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast<std::ptrdiff_t>(n); i++)
+            for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast<std::ptrdiff_t>(n); i++)
+            {
+                *(dst + i) = static_cast<value_type>(*(src + i));
+            }
+        }
+        else
         {
-            *(dst + i) = static_cast<value_type>(*(src + i));
+            for (; n > size_type(0); --n)
+            {
+                *dst = static_cast<value_type>(*src);
+                ++src;
+                ++dst;
+            }
         }
 #else
         for (; n > size_type(0); --n)
@@ -850,6 +883,27 @@ namespace xt
                     }
                 }
             }
+
+            template <class T>
+            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
+            {
+                dynamic_shape<std::size_t> stride_sizes;
+                xt::resize_container(stride_sizes, outer_shape.size());
+                // compute strides
+                using size_type = typename T::size_type;
+                for (size_type i = outer_shape.size(); i > 0; i--)
+                {
+                    stride_sizes[i - 1] = (i == outer_shape.size()) ? 1 : stride_sizes[i] * outer_shape[i];
+                }
+
+                // compute index
+                for (size_type i = 0; i < outer_shape.size(); i++)
+                {
+                    auto d_idx = n / stride_sizes[i];
+                    outer_index[i] = d_idx;
+                    n -= d_idx * stride_sizes[i];
+                }
+            }
         };
 
         template <>
@@ -874,6 +928,30 @@ namespace xt
                     }
                 }
             }
+
+            template <class T>
+            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
+            {
+                dynamic_shape<std::size_t> stride_sizes;
+                xt::resize_container(stride_sizes, outer_shape.size());
+
+                using size_type = typename T::size_type;
+
+                // compute required strides
+                for (size_type i = 0; i < outer_shape.size(); i++)
+                {
+                    stride_sizes[i] = (i == 0) ? 1 : stride_sizes[i - 1] * outer_shape[i - 1];
+                }
+
+                // compute index
+                for (size_type i = outer_shape.size(); i > 0;)
+                {
+                    i--;
+                    auto d_idx = n / stride_sizes[i];
+                    outer_index[i] = d_idx;
+                    n -= d_idx * stride_sizes[i];
+                }
+            }
         };
 
         template <layout_type L, class S>
@@ -890,6 +968,7 @@ namespace xt
             template <class T, layout_type LE = L>
             std::enable_if_t<LE == layout_type::row_major, std::size_t> operator()(const T& el)
             {
+                // All dimenions less than var have differing strides
                 auto var = check_strides_overlap<layout_type::row_major>::get(m_strides, el.strides());
                 if (var > m_cut)
                 {
@@ -902,6 +981,7 @@ namespace xt
             std::enable_if_t<LE == layout_type::column_major, std::size_t> operator()(const T& el)
             {
                 auto var = check_strides_overlap<layout_type::column_major>::get(m_strides, el.strides());
+                // All dimensions >= var have differing strides
                 if (var < m_cut)
                 {
                     m_cut = var;
@@ -928,25 +1008,70 @@ namespace xt
             const strides_type& m_strides;
         };
 
-        template <class E1, class E2>
-        auto get_loop_sizes(const E1& e1, const E2& e2, bool is_row_major)
+        template <bool possible = true, class E1, class E2, std::enable_if_t<!has_strides<E1>::value || !possible, bool> = true>
+        loop_sizes_t get_loop_sizes(const E1& e1, const E2&)
         {
+            return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
+        }
+
+        template <bool possible = true, class E1, class E2, std::enable_if_t<has_strides<E1>::value && possible, bool> = true>
+        loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2)
+        {
+            using shape_value_type = typename E1::shape_type::value_type;
+            bool is_row_major = true;
+
+            // Try to find a row-major scheme first, where the outer loop is on the first N = `cut`
+            // dimensions, and the inner loop is
+            is_row_major = true;
+            auto is_zero = [](auto i)
+            {
+                return i == 0;
+            };
+            auto&& strides = e1.strides();
+            auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero);
+            bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1;
+            auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero);
+            bool de1_col_contiguous = it_fwd != strides.end() && *it_fwd == 1;
+            if (de1_row_contiguous)
+            {
+                is_row_major = true;
+            }
+            else if (de1_col_contiguous)
+            {
+                is_row_major = false;
+            }
+            else
+            {
+                // No strided loop possible.
+                return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
+            }
+
+            // Cut is the number of dimensions in the outer loop
             std::size_t cut = 0;
 
-            // TODO! if E1 is !contiguous --> initialize cut to sensible value!
-            if (E1::static_layout == layout_type::row_major || is_row_major)
+            if (is_row_major)
             {
                 auto csf = check_strides_functor<layout_type::row_major, decltype(e1.strides())>(e1.strides());
                 cut = csf(e2);
+                // This makes that only one dimension will be treated in the inner loop.
+                if (cut < e1.strides().size() - 1)
+                {
+                    // Only make the inner loop go over one dimension by default for now
+                    cut = e1.strides().size() - 1;
+                }
             }
-            else if (E1::static_layout == layout_type::column_major || !is_row_major)
+            else if (!is_row_major)
             {
                 auto csf = check_strides_functor<layout_type::column_major, decltype(e1.strides())>(e1.strides()
                 );
                 cut = csf(e2);
+                if (cut > 1)
+                {
+                    // Only make the inner loop go over one dimension by default for now
+                    cut = 1;
+                }
             }  // can't reach here because this would have already triggered the fallback
 
-            using shape_value_type = typename E1::shape_type::value_type;
             std::size_t outer_loop_size = static_cast<std::size_t>(std::accumulate(
                 e1.shape().begin(),
                 e1.shape().begin() + static_cast<std::ptrdiff_t>(cut),
@@ -960,61 +1085,33 @@ namespace xt
                 std::multiplies<shape_value_type>{}
             ));
 
-            if (E1::static_layout == layout_type::column_major || !is_row_major)
+            if (!is_row_major)
             {
                 std::swap(outer_loop_size, inner_loop_size);
             }
 
-            return std::make_tuple(inner_loop_size, outer_loop_size, cut);
+            return {inner_loop_size > 1, is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()};
         }
     }
 
     template <bool simd>
     template <class E1, class E2>
-    inline void strided_loop_assigner<simd>::run(E1& e1, const E2& e2)
+    inline strided_assign_detail::loop_sizes_t strided_loop_assigner<simd>::get_loop_sizes(E1& e1, const E2& e2)
     {
-        bool is_row_major = true;
-        using fallback_assigner = stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>;
+        return strided_assign_detail::get_loop_sizes<simd>(e1, e2);
+    }
 
-        if (E1::static_layout == layout_type::dynamic)
-        {
-            layout_type dynamic_layout = e1.layout();
-            switch (dynamic_layout)
-            {
-                case layout_type::row_major:
-                    is_row_major = true;
-                    break;
-                case layout_type::column_major:
-                    is_row_major = false;
-                    break;
-                default:
-                    return fallback_assigner(e1, e2).run();
-            }
-        }
-        else if (E1::static_layout == layout_type::row_major)
-        {
-            is_row_major = true;
-        }
-        else if (E1::static_layout == layout_type::column_major)
-        {
-            is_row_major = false;
-        }
-        else
-        {
-            XTENSOR_THROW(std::runtime_error, "Illegal layout set (layout_type::any?).");
-        }
+#define strided_parallel_assign
 
-        std::size_t inner_loop_size, outer_loop_size, cut;
-        std::tie(inner_loop_size, outer_loop_size, cut) = strided_assign_detail::get_loop_sizes(
-            e1,
-            e2,
-            is_row_major
-        );
+    template <bool simd>
+    template <class E1, class E2>
+    inline void strided_loop_assigner<simd>::run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes)
+    {
+        bool is_row_major = loop_sizes.is_row_major;
+        std::size_t inner_loop_size = loop_sizes.inner_loop_size;
+        std::size_t outer_loop_size = loop_sizes.outer_loop_size;
+        std::size_t cut = loop_sizes.cut;
 
-        if ((is_row_major && cut == e1.dimension()) || (!is_row_major && cut == 0))
-        {
-            return fallback_assigner(e1, e2).run();
-        }
 
         // TODO can we get rid of this and use `shape_type`?
         dynamic_shape<std::size_t> idx, max_shape;
@@ -1054,50 +1151,216 @@ namespace xt
         {
             step_dim = cut;
         }
-
-        for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
+#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign)
+        if (outer_loop_size >= XTENSOR_OPENMP_TRESHOLD / inner_loop_size)
         {
-            for (std::size_t i = 0; i < simd_size; ++i)
+            std::size_t first_step = true;
+#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx)
+            for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
             {
-                res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
-            }
-            for (std::size_t i = 0; i < simd_rest; ++i)
-            {
-                *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
-                res_stepper.step_leading();
-                fct_stepper.step_leading();
-            }
+                if (first_step)
+                {
+                    is_row_major
+                        ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(ox, idx, max_shape)
+                        : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(ox, idx, max_shape);
 
-            is_row_major
-                ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
-                : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);
+                    for (std::size_t i = 0; i < idx.size(); ++i)
+                    {
+                        fct_stepper.step(i + step_dim, idx[i]);
+                        res_stepper.step(i + step_dim, idx[i]);
+                    }
+                    first_step = false;
+                }
 
-            fct_stepper.to_begin();
+                for (std::size_t i = 0; i < simd_size; ++i)
+                {
+                    res_stepper.template store_simd(fct_stepper.template step_simd<value_type>());
+                }
+                for (std::size_t i = 0; i < simd_rest; ++i)
+                {
+                    *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
+                    res_stepper.step_leading();
+                    fct_stepper.step_leading();
+                }
 
-            // need to step E1 as well if not contigous assign (e.g. view)
-            if (!E1::contiguous_layout)
-            {
-                res_stepper.to_begin();
-                for (std::size_t i = 0; i < idx.size(); ++i)
+                // next unaligned index
+                is_row_major
+                    ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
+                    : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);
+
+                fct_stepper.to_begin();
+
+                // need to step E1 as well if not contigous assign (e.g. view)
+                if (!E1::contiguous_layout)
                 {
-                    fct_stepper.step(i + step_dim, idx[i]);
-                    res_stepper.step(i + step_dim, idx[i]);
+                    res_stepper.to_begin();
+                    for (std::size_t i = 0; i < idx.size(); ++i)
+                    {
+                        fct_stepper.step(i + step_dim, idx[i]);
+                        res_stepper.step(i + step_dim, idx[i]);
+                    }
+                }
+                else
+                {
+                    for (std::size_t i = 0; i < idx.size(); ++i)
+                    {
+                        fct_stepper.step(i + step_dim, idx[i]);
+                    }
                 }
             }
-            else
+        }
+        else
+        {
+#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB)
+        if (outer_loop_size > XTENSOR_TBB_THRESHOLD / inner_loop_size)
+        {
+            tbb::static_partitioner sp;
+            tbb::parallel_for(
+                tbb::blocked_range<size_t>(0ul, outer_loop_size),
+                [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_ = idx](
+                    const tbb::blocked_range<size_t>& r
+                )
+                {
+                    auto idx = idx_;
+                    auto fct_stepper = e2.stepper_begin(e1.shape());
+                    auto res_stepper = e1.stepper_begin(e1.shape());
+                    std::size_t first_step = true;
+                    // #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper,
+                    // res_stepper, idx)
+                    for (std::size_t ox = r.begin(); ox < r.end(); ++ox)
+                    {
+                        if (first_step)
+                        {
+                            is_row_major
+                                ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(ox, idx, max_shape)
+                                : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(
+                                    ox,
+                                    idx,
+                                    max_shape
+                                );
+
+                            for (std::size_t i = 0; i < idx.size(); ++i)
+                            {
+                                fct_stepper.step(i + step_dim, idx[i]);
+                                res_stepper.step(i + step_dim, idx[i]);
+                            }
+                            first_step = false;
+                        }
+
+                        for (std::size_t i = 0; i < simd_size; ++i)
+                        {
+                            res_stepper.template store_simd(fct_stepper.template step_simd<value_type>());
+                        }
+                        for (std::size_t i = 0; i < simd_rest; ++i)
+                        {
+                            *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
+                            res_stepper.step_leading();
+                            fct_stepper.step_leading();
+                        }
+
+                        // next unaligned index
+                        is_row_major
+                            ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
+                            : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);
+
+                        fct_stepper.to_begin();
+
+                        // need to step E1 as well if not contigous assign (e.g. view)
+                        if (!E1::contiguous_layout)
+                        {
+                            res_stepper.to_begin();
+                            for (std::size_t i = 0; i < idx.size(); ++i)
+                            {
+                                fct_stepper.step(i + step_dim, idx[i]);
+                                res_stepper.step(i + step_dim, idx[i]);
+                            }
+                        }
+                        else
+                        {
+                            for (std::size_t i = 0; i < idx.size(); ++i)
+                            {
+                                fct_stepper.step(i + step_dim, idx[i]);
+                            }
+                        }
+                    }
+                },
+                sp
+            );
+        }
+        else
+        {
+
+#endif
+            for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
             {
-                for (std::size_t i = 0; i < idx.size(); ++i)
+                for (std::size_t i = 0; i < simd_size; ++i)
+                {
+                    res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
+                }
+                for (std::size_t i = 0; i < simd_rest; ++i)
+                {
+                    *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
+                    res_stepper.step_leading();
+                    fct_stepper.step_leading();
+                }
+
+                is_row_major
+                    ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
+                    : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);
+
+                fct_stepper.to_begin();
+
+                // need to step E1 as well if not contigous assign (e.g. view)
+                if (!E1::contiguous_layout)
+                {
+                    res_stepper.to_begin();
+                    for (std::size_t i = 0; i < idx.size(); ++i)
+                    {
+                        fct_stepper.step(i + step_dim, idx[i]);
+                        res_stepper.step(i + step_dim, idx[i]);
+                    }
+                }
+                else
                 {
-                    fct_stepper.step(i + step_dim, idx[i]);
+                    for (std::size_t i = 0; i < idx.size(); ++i)
+                    {
+                        fct_stepper.step(i + step_dim, idx[i]);
+                    }
                 }
             }
+#if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign)
+        }
+#endif
+    }
+
+    template <>
+    template <class E1, class E2>
+    inline void strided_loop_assigner<true>::run(E1& e1, const E2& e2)
+    {
+        strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner<true>::get_loop_sizes(e1, e2);
+        if (loop_sizes.can_do_strided_assign)
+        {
+            run(e1, e2, loop_sizes);
         }
+        else
+        {
+            // trigger the fallback assigner
+            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
+        }
+    }
+
+    template <>
+    template <class E1, class E2>
+    inline void strided_loop_assigner<false>::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&)
+    {
     }
 
     template <>
     template <class E1, class E2>
-    inline void strided_loop_assigner<false>::run(E1& /*e1*/, const E2& /*e2*/)
+    inline void strided_loop_assigner<false>::run(E1& e1, const E2& e2)
     {
+        // trigger the fallback assigner
+        stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
     }
 }
 
diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp
index f5a79d3d7..d4621672d 100644
--- a/include/xtensor/xcontainer.hpp
+++ b/include/xtensor/xcontainer.hpp
@@ -924,9 +924,32 @@ namespace xt
     inline bool xstrided_container<D>::is_contiguous() const noexcept
     {
         using str_type = typename inner_strides_type::value_type;
-        return is_contiguous_container<storage_type>::value
-               && (m_strides.empty() || (m_layout == layout_type::row_major && m_strides.back() == str_type(1))
-                   || (m_layout == layout_type::column_major && m_strides.front() == str_type(1)));
+        auto is_zero = [](auto i)
+        {
+            return i == 0;
+        };
+        if (!is_contiguous_container<storage_type>::value)
+        {
+            return false;
+        }
+        // We need to make sure the inner-most non-zero stride is one.
+        // Trailing zero strides are ignored because they indicate bradcasted dimensions.
+        if (m_layout == layout_type::row_major)
+        {
+            auto it = std::find_if_not(m_strides.rbegin(), m_strides.rend(), is_zero);
+            // If the array has strides of zero, it is a constant, and therefore contiguous.
+            return it == m_strides.rend() || *it == str_type(1);
+        }
+        else if (m_layout == layout_type::column_major)
+        {
+            auto it = std::find_if_not(m_strides.begin(), m_strides.end(), is_zero);
+            // If the array has strides of zero, it is a constant, and therefore contiguous.
+            return it == m_strides.end() || *it == str_type(1);
+        }
+        else
+        {
+            return m_strides.empty();
+        }
     }
 
     namespace detail
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7f696e6a5..0ce881944 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -173,6 +173,7 @@ set(XTENSOR_TESTS
     main.cpp
     test_xaccumulator.cpp
     test_xadapt.cpp
+    test_strided_assign.cpp
     test_xassign.cpp
     test_xaxis_iterator.cpp
     test_xaxis_slice_iterator.cpp
diff --git a/test/test_strided_assign.cpp b/test/test_strided_assign.cpp
new file mode 100644
index 000000000..e10ba74c2
--- /dev/null
+++ b/test/test_strided_assign.cpp
@@ -0,0 +1,197 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht          *
+ * Copyright (c) QuantStack                                                 *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include <type_traits>
+#include <vector>
+
+#include "xtensor/xadapt.hpp"
+#include "xtensor/xarray.hpp"
+#include "xtensor/xassign.hpp"
+#include "xtensor/xlayout.hpp"
+#include "xtensor/xnoalias.hpp"
+#include "xtensor/xtensor.hpp"
+#include "xtensor/xview.hpp"
+
+#include "test_common.hpp"
+#include "test_common_macros.hpp"
+
+namespace xt
+{
+    TEST(xassign_strided, mix_shape_types)
+    {
+        auto check_linear_assign = [](auto a, auto b)
+        {
+            assert_compatible_shape(a, b);
+            return xassign_traits<decltype(a), decltype(b)>::linear_assign(a, b, true);
+        };
+        auto check_strided_assign = [](auto a, auto b)
+        {
+            assert_compatible_shape(a, b);
+#ifndef _WIN32
+            static_assert(
+                xassign_traits<decltype(a), decltype(b)>::strided_assign(),
+                "Failed to do strided assign"
+            );
+#endif
+            return strided_assign_detail::get_loop_sizes(a, b).can_do_strided_assign;
+        };
+        {
+            size_t size = 50;
+            const std::array<size_t, 3> shape = {size, size, size};
+            xt::xtensor<double, 3> a(shape), b(shape);
+            auto core = xt::range(1, size - 1);
+            auto lhs = xt::view(b, core, core, core);
+            auto rhs = 1.0 / 7.0
+                       * (xt::view(a, core, core, core) + xt::view(a, core, xt::range(2, size), core)
+                          + xt::view(a, core, xt::range(0, size - 2), core)
+                          + xt::view(a, xt::range(2, size), core, core)
+                          + xt::view(a, xt::range(0, size - 2), core, core));
+            xt::noalias(lhs) = rhs;
+            EXPECT_TRUE(check_strided_assign(lhs, rhs));
+        }
+        {
+            auto data = std::vector<double>{
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
+            auto simple_xtensor_12 = xt::xtensor<double, 2, layout_type::row_major>{
+                {1, 2, 3, 4},
+                {5, 6, 7, 8},
+                {9, 10, 11, 12}};
+            auto simple_xtensor_16 = xt::xtensor<double, 2, layout_type::row_major>{
+                {1, 2, 3, 4},
+                {5, 6, 7, 8},
+                {9, 10, 11, 12},
+                {13, 14, 15, 16}};
+            auto adapter_strided_noncont = xt::adapt(
+                data.data(),
+                12,
+                xt::no_ownership(),
+                std::vector<size_t>{4, 3},
+                std::vector<size_t>{4, 1}
+            );
+            auto adapter_strided_cont = xt::adapt(
+                data.data() + 1,
+                12,
+                xt::no_ownership(),
+                std::vector<size_t>{3, 4},
+                std::vector<size_t>{4, 1}
+            );
+            auto adapter_strided_noncont_43 = xt::adapt(
+                data.data(),
+                12,
+                xt::no_ownership(),
+                std::vector<size_t>{3, 4},
+                std::vector<size_t>{5, 1}
+            );
+            EXPECT_FALSE(check_strided_assign(adapter_strided_noncont, xt::transpose(adapter_strided_cont)));
+            auto contiguous_view = xt::view(simple_xtensor_12, xt::range(0, 3), xt::all());
+            EXPECT_TRUE(check_linear_assign(contiguous_view, adapter_strided_cont));
+            EXPECT_FALSE(check_linear_assign(contiguous_view, xt::transpose(adapter_strided_noncont)));
+            EXPECT_FALSE(check_linear_assign(xt::transpose(adapter_strided_noncont), contiguous_view));
+            EXPECT_FALSE(check_linear_assign(contiguous_view, adapter_strided_noncont_43));
+            EXPECT_FALSE(check_linear_assign(adapter_strided_noncont_43, contiguous_view));
+            EXPECT_TRUE(check_strided_assign(contiguous_view, adapter_strided_noncont_43));
+            EXPECT_TRUE(check_strided_assign(adapter_strided_noncont_43, contiguous_view));
+            auto contiguous_view2 = xt::view(simple_xtensor_12, xt::range(1, 3), xt::all());
+            auto contiguous_view3 = xt::view(simple_xtensor_12, xt::range(0, 2), xt::all());
+            EXPECT_TRUE(check_linear_assign(contiguous_view2, contiguous_view3));
+            {
+                auto view_noncont = xt::view(simple_xtensor_16, xt::all(), xt::range(0, 3));
+                EXPECT_FALSE(check_linear_assign(adapter_strided_noncont, view_noncont));
+                EXPECT_TRUE(check_strided_assign(adapter_strided_noncont, view_noncont));
+            }
+            {
+                auto view_cont = xt::view(simple_xtensor_16, xt::range(0, 1), xt::range(0, 4));
+                auto view_cont2 = xt::view(simple_xtensor_16, xt::range(3, 4), xt::range(0, 4));
+                EXPECT_TRUE(check_linear_assign(view_cont, view_cont2));
+                auto view_noncont = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(0, 3));
+                auto view_noncont2 = xt::view(simple_xtensor_16, xt::range(0, 4), xt::range(1, 4));
+                EXPECT_FALSE(check_linear_assign(view_noncont, view_noncont2));
+                EXPECT_TRUE(check_strided_assign(view_noncont, view_noncont2));
+            }
+
+            {
+                std::vector<double> data2{-1, -1, -1, -1};
+                auto linear_adapter = xt::adapt<layout_type::row_major>(
+                    data2.data(),
+                    4,
+                    xt::no_ownership(),
+                    std::vector<size_t>{4, 1}
+                );
+                auto adapter_cont2 = xt::adapt<layout_type::row_major>(
+                    data.data(),
+                    4,
+                    xt::no_ownership(),
+                    std::vector<size_t>{1, 4}
+                );
+                EXPECT_TRUE(linear_adapter.is_contiguous());
+                EXPECT_TRUE(adapter_cont2.is_contiguous());
+                bool success_one = check_linear_assign(linear_adapter, xt::transpose(adapter_cont2));
+                bool success_two = check_linear_assign(adapter_cont2, xt::transpose(linear_adapter));
+                EXPECT_TRUE(success_one && success_two);
+                auto adapter_noncont_singlecol = xt::adapt(
+                    data.data(),
+                    4,
+                    xt::no_ownership(),
+                    std::vector<size_t>{4, 1},
+                    std::vector<size_t>{4, 1}
+                );
+                EXPECT_FALSE(check_linear_assign(adapter_noncont_singlecol, linear_adapter));
+                EXPECT_FALSE(check_linear_assign(linear_adapter, adapter_noncont_singlecol));
+                EXPECT_FALSE(check_strided_assign(adapter_noncont_singlecol, linear_adapter));
+                EXPECT_FALSE(check_strided_assign(linear_adapter, adapter_noncont_singlecol));
+                auto adapter_noncont_twocol = xt::adapt(
+                    data.data(),
+                    4,
+                    xt::no_ownership(),
+                    std::vector<size_t>{2, 2},
+                    std::vector<size_t>{4, 1}
+                );
+                EXPECT_FALSE(check_linear_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2})));
+                EXPECT_FALSE(check_linear_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol));
+                EXPECT_TRUE(check_strided_assign(adapter_noncont_twocol, linear_adapter.reshape({2, 2})));
+                EXPECT_TRUE(check_strided_assign(linear_adapter.reshape({2, 2}), adapter_noncont_twocol));
+                auto adapter_zero_strides = xt::adapt(
+                    data.data(),
+                    4,
+                    xt::no_ownership(),
+                    std::vector<size_t>{2, 2},
+                    std::vector<size_t>{0, 0}
+                );
+                EXPECT_FALSE(check_linear_assign(adapter_zero_strides, adapter_zero_strides));
+                EXPECT_FALSE(check_strided_assign(adapter_zero_strides, adapter_zero_strides));
+                EXPECT_FALSE(check_linear_assign(adapter_zero_strides, linear_adapter));
+            }
+
+            {
+                std::vector<double> data2{-1, -1, -1, -1, -1, -1};
+                auto linear_adapter2 = xt::adapt<layout_type::row_major>(
+                    data2.data(),
+                    6,
+                    xt::no_ownership(),
+                    std::vector<size_t>{3, 2}
+                );
+                auto strided_adapter = xt::adapt(
+                    data.data(),
+                    6,
+                    xt::no_ownership(),
+                    std::vector<size_t>{3, 2},
+                    std::vector<size_t>{4, 1}
+                );
+                EXPECT_FALSE(check_linear_assign(linear_adapter2, strided_adapter));
+                EXPECT_FALSE(check_linear_assign(strided_adapter, linear_adapter2));
+                EXPECT_TRUE(check_strided_assign(linear_adapter2, strided_adapter));
+                EXPECT_TRUE(check_strided_assign(strided_adapter, linear_adapter2));
+                xt::noalias(strided_adapter) = linear_adapter2;
+                auto result_expected = std::vector<double>{
+                    -1, -1, 3, 4, -1, -1, 7, 8, -1, -1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
+                EXPECT_EQ(data, result_expected);
+            }
+        }
+    }
+}
diff --git a/test/test_xassign.cpp b/test/test_xassign.cpp
index 475d0e5cf..5367a32da 100644
--- a/test/test_xassign.cpp
+++ b/test/test_xassign.cpp
@@ -7,6 +7,7 @@
  * The full license is in the file LICENSE, distributed with this software. *
  ****************************************************************************/
 
+#include <iterator>
 #include <type_traits>
 #include <vector>
 
@@ -82,6 +83,16 @@ class my_vector
         return m_data.begin();
     }
 
+    auto rbegin() const
+    {
+        return std::make_reverse_iterator(end());
+    }
+
+    auto rend() const
+    {
+        return std::make_reverse_iterator(begin());
+    }
+
     auto empty() const
     {
         return m_data.empty();