diff --git a/simon-simd.cpp b/simon-simd.cpp
index e80286e9b..ab908ecaa 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -64,6 +64,24 @@ using CryptoPP::vec_swap;  // SunCC
 
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 
+template <class T>
+inline T UnpackHigh32(const T& a, const T& b)
+{
+    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
+    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
+    const uint32x2x2_t r = vzip_u32(x, y);
+    return (T)vcombine_u32(r.val[0], r.val[1]);
+}
+
+template <class T>
+inline T UnpackLow32(const T& a, const T& b)
+{
+    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
+    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
+    const uint32x2x2_t r = vzip_u32(x, y);
+    return (T)vcombine_u32(r.val[0], r.val[1]);
+}
+
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
@@ -114,16 +132,6 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
 }
 #endif
 
-inline uint32x4_t Shuffle32(const uint32x4_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u32_u8(
-        vrev32q_u8(vreinterpretq_u8_u32(val)));
-#else
-    return val;
-#endif
-}
-
 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
 {
     return veorq_u32(RotateLeft32<2>(val),
@@ -133,15 +141,13 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
 
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
@@ -160,25 +166,21 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
         std::swap(x1, y1);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }
 
 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
 
     if (rounds & 1)
     {
@@ -198,32 +200,26 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
         y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }
 
 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
 
     for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
     {
@@ -248,38 +244,30 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
         std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }
 
 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
 
     if (rounds & 1)
     {
@@ -305,17 +293,13 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
         y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }
 
 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -388,16 +372,6 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
 }
 #endif
 
-inline uint64x2_t Shuffle64(const uint64x2_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u64_u8(
-        vrev64q_u8(vreinterpretq_u8_u64(val)));
-#else
-    return val;
-#endif
-}
-
 inline uint64x2_t SIMON128_f(const uint64x2_t& val)
 {
     return veorq_u64(RotateLeft64<2>(val),
@@ -407,14 +381,12 @@ inline uint64x2_t SIMON128_f(const uint64x2_t& val)
 inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
 
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
@@ -433,30 +405,25 @@ inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
         std::swap(x1, y1);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }
 
 inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);
 
     for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
     {
@@ -481,29 +448,24 @@ inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
         std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }
 
 inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
 
     if (rounds & 1)
     {
@@ -523,30 +485,25 @@ inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
         y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }
 
 inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);
 
     if (rounds & 1)
     {
@@ -572,16 +529,13 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
         y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }
 
 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -670,8 +624,8 @@ inline __m128i SIMON128_f(const __m128i& v)
 inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -706,8 +660,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -754,8 +708,8 @@ inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -791,8 +745,8 @@ inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -881,8 +835,8 @@ inline __m128i SIMON64_f(const __m128i& v)
 inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -916,8 +870,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -953,8 +907,8 @@ inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -1009,8 +963,8 @@ inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
diff --git a/simon.h b/simon.h
index 19f52e3c6..3ecd0318e 100644
--- a/simon.h
+++ b/simon.h
@@ -17,11 +17,11 @@
 #include "seckey.h"
 #include "secblock.h"
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
 #endif
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS 1
 #endif
 
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 6a14afcdc..58d680dd1 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -61,6 +61,24 @@ using CryptoPP::word64;
 
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 
+template <class T>
+inline T UnpackHigh32(const T& a, const T& b)
+{
+    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
+    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
+    const uint32x2x2_t r = vzip_u32(x, y);
+    return (T)vcombine_u32(r.val[0], r.val[1]);
+}
+
+template <class T>
+inline T UnpackLow32(const T& a, const T& b)
+{
+    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
+    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
+    const uint32x2x2_t r = vzip_u32(x, y);
+    return (T)vcombine_u32(r.val[0], r.val[1]);
+}
+
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
@@ -111,27 +129,15 @@ inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
 }
 #endif  // Aarch32 or Aarch64
 
-inline uint32x4_t Shuffle32(const uint32x4_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u32_u8(
-        vrev32q_u8(vreinterpretq_u8_u32(val)));
-#else
-    return val;
-#endif
-}
-
 inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
 
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
@@ -144,24 +150,20 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
         y1 = veorq_u32(y1, x1);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }
 
 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
 
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
@@ -174,32 +176,26 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
         x1 = RotateLeft32<8>(x1);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
 }
 
 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
 
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
@@ -222,38 +218,30 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
         y3 = veorq_u32(y3, x3);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }
 
 inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
-    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
-    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
-    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
-
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
 
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
@@ -276,17 +264,13 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
         x3 = RotateLeft32<8>(x3);
     }
 
-    x1 = Shuffle32(x1); y1 = Shuffle32(y1);
-    x2 = Shuffle32(x2); y2 = Shuffle32(y2);
-    x3 = Shuffle32(x3); y3 = Shuffle32(y3);
-
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    block0 = vzipq_u32(x1, y1).val[0];
-    block1 = vzipq_u32(x1, y1).val[1];
-    block2 = vzipq_u32(x2, y2).val[0];
-    block3 = vzipq_u32(x2, y2).val[1];
-    block4 = vzipq_u32(x3, y3).val[0];
-    block5 = vzipq_u32(x3, y3).val[1];
+    block0 = UnpackLow32(y1, x1);
+    block1 = UnpackHigh32(y1, x1);
+    block2 = UnpackLow32(y2, x2);
+    block3 = UnpackHigh32(y2, x2);
+    block4 = UnpackLow32(y3, x3);
+    block5 = UnpackHigh32(y3, x3);
 }
 
 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -359,27 +343,15 @@ inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
 }
 #endif
 
-inline uint64x2_t Shuffle64(const uint64x2_t& val)
-{
-#if defined(CRYPTOPP_LITTLE_ENDIAN)
-    return vreinterpretq_u64_u8(
-        vrev64q_u8(vreinterpretq_u8_u64(val)));
-#else
-    return val;
-#endif
-}
-
 inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
 
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
@@ -392,31 +364,25 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
         y1 = veorq_u64(y1, x1);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }
 
 inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);
 
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
@@ -439,30 +405,24 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
         y3 = veorq_u64(y3, x3);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }
 
 inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
 
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
@@ -475,31 +435,25 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
         x1 = RotateLeft64<8>(x1);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-
     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
 }
 
 inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
-
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
+    uint64x2_t x1 = UnpackHigh64(block0, block1);
+    uint64x2_t y1 = UnpackLow64(block0, block1);
+    uint64x2_t x2 = UnpackHigh64(block2, block3);
+    uint64x2_t y2 = UnpackLow64(block2, block3);
+    uint64x2_t x3 = UnpackHigh64(block4, block5);
+    uint64x2_t y3 = UnpackLow64(block4, block5);
 
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
@@ -522,17 +476,13 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
         x3 = RotateLeft64<8>(x3);
     }
 
-    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
-
     // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(y1, x1);
+    block1 = UnpackHigh64(y1, x1);
+    block2 = UnpackLow64(y2, x2);
+    block3 = UnpackHigh64(y2, x2);
+    block4 = UnpackLow64(y3, x3);
+    block5 = UnpackHigh64(y3, x3);
 }
 
 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -605,8 +555,8 @@ inline __m128i RotateRight64<8>(const __m128i& val)
 inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -633,8 +583,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -678,8 +628,8 @@ inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -706,8 +656,8 @@ inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
@@ -785,8 +735,8 @@ inline __m128i RotateRight32<8>(const __m128i& val)
 inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -815,8 +765,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
 inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -846,8 +796,8 @@ inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
@@ -901,8 +851,8 @@ inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read from
-    // a big-endian byte array. Depending on the number of blocks it needs to
+    // Rearrange the data for vectorization. The incoming data was read into
+    // a little-endian word array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
     // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
diff --git a/speck.h b/speck.h
index 44847e9bf..e0c4051f0 100644
--- a/speck.h
+++ b/speck.h
@@ -17,11 +17,11 @@
 #include "seckey.h"
 #include "secblock.h"
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
 #endif
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
 # define CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS 1
 #endif